Message ID | 20240406173649.3210836-3-vdonnefort@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | None | expand |
On Sat, 6 Apr 2024 18:36:46 +0100 Vincent Donnefort <vdonnefort@google.com> wrote: > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > + struct vm_area_struct *vma) > +{ > + struct ring_buffer_per_cpu *cpu_buffer; > + unsigned long flags, *subbuf_ids; > + int err = 0; > + > + if (!cpumask_test_cpu(cpu, buffer->cpumask)) > + return -EINVAL; > + > + cpu_buffer = buffer->buffers[cpu]; > + > + mutex_lock(&cpu_buffer->mapping_lock); > + > + if (cpu_buffer->mapped) { > + err = __rb_map_vma(cpu_buffer, vma); > + if (!err) > + err = __rb_inc_dec_mapped(cpu_buffer, true); > + mutex_unlock(&cpu_buffer->mapping_lock); > + return err; > + } > + > + /* prevent another thread from changing buffer/sub-buffer sizes */ > + mutex_lock(&buffer->mutex); > + > + err = rb_alloc_meta_page(cpu_buffer); > + if (err) > + goto unlock; > + > + /* subbuf_ids include the reader while nr_pages does not */ > + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); > + if (!subbuf_ids) { > + rb_free_meta_page(cpu_buffer); > + err = -ENOMEM; > + goto unlock; > + } > + > + atomic_inc(&cpu_buffer->resize_disabled); > + > + /* > + * Lock all readers to block any subbuf swap until the subbuf IDs are > + * assigned. > + */ > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > + > + err = __rb_map_vma(cpu_buffer, vma); > + if (!err) { > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > + cpu_buffer->mapped = 1; > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > + } else { > + kfree(cpu_buffer->subbuf_ids); > + cpu_buffer->subbuf_ids = NULL; > + rb_free_meta_page(cpu_buffer); > + } > +unlock: Nit: For all labels, please add a space before them. Otherwise, diffs will show "unlock" as the function and not "ring_buffer_map", making it harder to find where the change is. Same for the labels below. -- Steve > + mutex_unlock(&buffer->mutex); > + mutex_unlock(&cpu_buffer->mapping_lock); > + > + return err; > +} > + > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) > +{ > + struct ring_buffer_per_cpu *cpu_buffer; > + unsigned long flags; > + int err = 0; > + > + if (!cpumask_test_cpu(cpu, buffer->cpumask)) > + return -EINVAL; > + > + cpu_buffer = buffer->buffers[cpu]; > + > + mutex_lock(&cpu_buffer->mapping_lock); > + > + if (!cpu_buffer->mapped) { > + err = -ENODEV; > + goto out; > + } else if (cpu_buffer->mapped > 1) { > + __rb_inc_dec_mapped(cpu_buffer, false); > + goto out; > + } > + > + mutex_lock(&buffer->mutex); > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > + > + cpu_buffer->mapped = 0; > + > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > + > + kfree(cpu_buffer->subbuf_ids); > + cpu_buffer->subbuf_ids = NULL; > + rb_free_meta_page(cpu_buffer); > + atomic_dec(&cpu_buffer->resize_disabled); > + > + mutex_unlock(&buffer->mutex); > +out: > + mutex_unlock(&cpu_buffer->mapping_lock); > + > + return err; > +} > + > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) > +{ > + struct ring_buffer_per_cpu *cpu_buffer; > + unsigned long reader_size; > + unsigned long flags; > + > + cpu_buffer = rb_get_mapped_buffer(buffer, cpu); > + if (IS_ERR(cpu_buffer)) > + return (int)PTR_ERR(cpu_buffer); > + > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > +consume: > + if (rb_per_cpu_empty(cpu_buffer)) > + goto out; > + > + reader_size = rb_page_size(cpu_buffer->reader_page); > + > + /* > + * There are data to be read on the current reader page, we can > + * return to the caller. But before that, we assume the latter will read > + * everything. Let's update the kernel reader accordingly. > + */ > + if (cpu_buffer->reader_page->read < reader_size) { > + while (cpu_buffer->reader_page->read < reader_size) > + rb_advance_reader(cpu_buffer); > + goto out; > + } > + > + if (WARN_ON(!rb_get_reader_page(cpu_buffer))) > + goto out; > + > + goto consume; > +out: > + /* Some archs do not have data cache coherency between kernel and user-space */ > + flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); > + > + rb_update_meta_page(cpu_buffer); > + > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > + rb_put_mapped_buffer(cpu_buffer); > + > + return 0; > +} > + > /* > * We only allocate new buffers, never free them if the CPU goes down. > * If we were to free the buffer, then the user would lose any trace that was in
On Sat, Apr 06, 2024 at 06:36:46PM +0100, Vincent Donnefort wrote: > In preparation for allowing the user-space to map a ring-buffer, add > a set of mapping functions: > > ring_buffer_{map,unmap}() > > And controls on the ring-buffer: > > ring_buffer_map_get_reader() /* swap reader and head */ > > Mapping the ring-buffer also involves: > > A unique ID for each subbuf of the ring-buffer, currently they are > only identified through their in-kernel VA. > > A meta-page, where are stored ring-buffer statistics and a > description for the current reader > > The linear mapping exposes the meta-page, and each subbuf of the > ring-buffer, ordered following their unique ID, assigned during the > first mapping. > > Once mapped, no subbuf can get in or out of the ring-buffer: the buffer > size will remain unmodified and the splice enabling functions will in > reality simply memcpy the data instead of swapping subbufs. > > CC: <linux-mm@kvack.org> > Signed-off-by: Vincent Donnefort <vdonnefort@google.com> > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > index dc5ae4e96aee..96d2140b471e 100644 > --- a/include/linux/ring_buffer.h > +++ b/include/linux/ring_buffer.h > @@ -6,6 +6,8 @@ > #include <linux/seq_file.h> > #include <linux/poll.h> > > +#include <uapi/linux/trace_mmap.h> > + > struct trace_buffer; > struct ring_buffer_iter; > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); > #define trace_rb_cpu_prepare NULL > #endif > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > + struct vm_area_struct *vma); > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > #endif /* _LINUX_RING_BUFFER_H */ > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h > new file mode 100644 > index 000000000000..ffcd8dfcaa4f > --- /dev/null > +++ b/include/uapi/linux/trace_mmap.h > @@ -0,0 +1,46 @@ > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > +#ifndef _TRACE_MMAP_H_ > +#define _TRACE_MMAP_H_ > + > +#include <linux/types.h> > + > +/** > + * struct trace_buffer_meta - Ring-buffer Meta-page description > + * @meta_page_size: Size of this meta-page. > + * @meta_struct_len: Size of this structure. > + * @subbuf_size: Size of each sub-buffer. > + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. > + * @reader.lost_events: Number of events lost at the time of the reader swap. > + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] > + * @reader.read: Number of bytes read on the reader subbuf. > + * @flags: Placeholder for now, 0 until new features are supported. > + * @entries: Number of entries in the ring-buffer. > + * @overrun: Number of entries lost in the ring-buffer. > + * @read: Number of entries that have been read. > + * @Reserved1: Reserved for future use. > + * @Reserved2: Reserved for future use. > + */ > +struct trace_buffer_meta { > + __u32 meta_page_size; > + __u32 meta_struct_len; > + > + __u32 subbuf_size; > + __u32 nr_subbufs; > + > + struct { > + __u64 lost_events; > + __u32 id; > + __u32 read; > + } reader; > + > + __u64 flags; > + > + __u64 entries; > + __u64 overrun; > + __u64 read; > + > + __u64 Reserved1; > + __u64 Reserved2; Why do you need reserved fields? This structure always resides in the beginning of a page and the rest of the page is essentially "reserved". > +}; > + > +#endif /* _TRACE_MMAP_H_ */ > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > index cc9ebe593571..793ecc454039 100644 > --- a/kernel/trace/ring_buffer.c > +++ b/kernel/trace/ring_buffer.c ... > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, > + unsigned long *subbuf_ids) > +{ > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > + struct buffer_page *first_subbuf, *subbuf; > + int id = 0; > + > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > + cpu_buffer->reader_page->id = id++; > + > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > + do { > + if (WARN_ON(id >= nr_subbufs)) > + break; > + > + subbuf_ids[id] = (unsigned long)subbuf->page; > + subbuf->id = id; > + > + rb_inc_page(&subbuf); > + id++; > + } while (subbuf != first_subbuf); > + > + /* install subbuf ID to kern VA translation */ > + cpu_buffer->subbuf_ids = subbuf_ids; > + > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; Isn't this a single page? > + meta->meta_struct_len = sizeof(*meta); > + meta->nr_subbufs = nr_subbufs; > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > + > + rb_update_meta_page(cpu_buffer); > +} ... > +#define subbuf_page(off, start) \ > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > + > +#define foreach_subbuf_page(sub_order, start, page) \ Nit: usually iterators in kernel use for_each > + page = subbuf_page(0, (start)); \ > + for (int __off = 0; __off < (1 << (sub_order)); \ > + __off++, page = subbuf_page(__off, (start))) The pages are allocated with alloc_pages_node(.. subbuf_order) are physically contiguous and struct pages for them are also contiguous, so inside a subbuf_order allocation you can just do page++.
On Thu, 18 Apr 2024 09:55:55 +0300 Mike Rapoport <rppt@kernel.org> wrote: Hi Mike, Thanks for doing this review! > > +/** > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > + * @meta_page_size: Size of this meta-page. > > + * @meta_struct_len: Size of this structure. > > + * @subbuf_size: Size of each sub-buffer. > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. > > + * @reader.lost_events: Number of events lost at the time of the reader swap. > > + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] > > + * @reader.read: Number of bytes read on the reader subbuf. > > + * @flags: Placeholder for now, 0 until new features are supported. > > + * @entries: Number of entries in the ring-buffer. > > + * @overrun: Number of entries lost in the ring-buffer. > > + * @read: Number of entries that have been read. > > + * @Reserved1: Reserved for future use. > > + * @Reserved2: Reserved for future use. > > + */ > > +struct trace_buffer_meta { > > + __u32 meta_page_size; > > + __u32 meta_struct_len; > > + > > + __u32 subbuf_size; > > + __u32 nr_subbufs; > > + > > + struct { > > + __u64 lost_events; > > + __u32 id; > > + __u32 read; > > + } reader; > > + > > + __u64 flags; > > + > > + __u64 entries; > > + __u64 overrun; > > + __u64 read; > > + > > + __u64 Reserved1; > > + __u64 Reserved2; > > Why do you need reserved fields? This structure always resides in the > beginning of a page and the rest of the page is essentially "reserved". So this code is also going to be used in arm's pkvm hypervisor code, where it will be using these fields, but since we are looking at keeping the same interface between the two, we don't want these used by this interface. We probably should add a comment about that. > > > +}; > > + > > +#endif /* _TRACE_MMAP_H_ */ > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > index cc9ebe593571..793ecc454039 100644 > > --- a/kernel/trace/ring_buffer.c > > +++ b/kernel/trace/ring_buffer.c > > ... > > > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, > > + unsigned long *subbuf_ids) > > +{ > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > > + struct buffer_page *first_subbuf, *subbuf; > > + int id = 0; > > + > > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > > + cpu_buffer->reader_page->id = id++; > > + > > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > > + do { > > + if (WARN_ON(id >= nr_subbufs)) > > + break; > > + > > + subbuf_ids[id] = (unsigned long)subbuf->page; > > + subbuf->id = id; > > + > > + rb_inc_page(&subbuf); > > + id++; > > + } while (subbuf != first_subbuf); > > + > > + /* install subbuf ID to kern VA translation */ > > + cpu_buffer->subbuf_ids = subbuf_ids; > > + > > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; > > Isn't this a single page? One thing we are doing is to make sure that the subbuffers are aligned by their size. If a subbuffer is 3 pages, it should be aligned on 3 page boundaries. This was something that Linus suggested. > > > + meta->meta_struct_len = sizeof(*meta); > > + meta->nr_subbufs = nr_subbufs; > > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > > + > > + rb_update_meta_page(cpu_buffer); > > +} > > ... > > > +#define subbuf_page(off, start) \ > > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > > + > > +#define foreach_subbuf_page(sub_order, start, page) \ > > Nit: usually iterators in kernel use for_each Ah, good catch. Yeah, that should be changed. But then ... > > > + page = subbuf_page(0, (start)); \ > > + for (int __off = 0; __off < (1 << (sub_order)); \ > > + __off++, page = subbuf_page(__off, (start))) > > The pages are allocated with alloc_pages_node(.. subbuf_order) are > physically contiguous and struct pages for them are also contiguous, so > inside a subbuf_order allocation you can just do page++. > I'm wondering if we should just nuke the macro. It was there because of the previous implementation did things twice. But now it's just done once here: + while (s < nr_subbufs && p < nr_pages) { + struct page *page; + + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { + if (p >= nr_pages) + break; + + pages[p++] = page; + } + s++; + } Perhaps that should just be: while (s < nr_subbufs && p < nr_pages) { struct page *page; int off; page = subbuf_page(0, cpu_buffer->subbuf_ids[s]); for (off = 0; off < (1 << subbuf_order); off++, page++, p++) { if (p >= nr_pages) break; pages[p] = page; } s++; } ? -- Steve
On 06.04.24 19:36, Vincent Donnefort wrote: > In preparation for allowing the user-space to map a ring-buffer, add > a set of mapping functions: > > ring_buffer_{map,unmap}() > > And controls on the ring-buffer: > > ring_buffer_map_get_reader() /* swap reader and head */ > > Mapping the ring-buffer also involves: > > A unique ID for each subbuf of the ring-buffer, currently they are > only identified through their in-kernel VA. > > A meta-page, where are stored ring-buffer statistics and a > description for the current reader > > The linear mapping exposes the meta-page, and each subbuf of the > ring-buffer, ordered following their unique ID, assigned during the > first mapping. > > Once mapped, no subbuf can get in or out of the ring-buffer: the buffer > size will remain unmodified and the splice enabling functions will in > reality simply memcpy the data instead of swapping subbufs. > > CC: <linux-mm@kvack.org> > Signed-off-by: Vincent Donnefort <vdonnefort@google.com> > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > index dc5ae4e96aee..96d2140b471e 100644 > --- a/include/linux/ring_buffer.h > +++ b/include/linux/ring_buffer.h > @@ -6,6 +6,8 @@ > #include <linux/seq_file.h> > #include <linux/poll.h> > > +#include <uapi/linux/trace_mmap.h> > + > struct trace_buffer; > struct ring_buffer_iter; > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); > #define trace_rb_cpu_prepare NULL > #endif > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > + struct vm_area_struct *vma); > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > #endif /* _LINUX_RING_BUFFER_H */ > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h > new file mode 100644 > index 000000000000..ffcd8dfcaa4f > --- /dev/null > +++ b/include/uapi/linux/trace_mmap.h > @@ -0,0 +1,46 @@ > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > +#ifndef _TRACE_MMAP_H_ > +#define _TRACE_MMAP_H_ > + > +#include <linux/types.h> > + > +/** > + * struct trace_buffer_meta - Ring-buffer Meta-page description > + * @meta_page_size: Size of this meta-page. > + * @meta_struct_len: Size of this structure. > + * @subbuf_size: Size of each sub-buffer. > + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. > + * @reader.lost_events: Number of events lost at the time of the reader swap. > + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] > + * @reader.read: Number of bytes read on the reader subbuf. > + * @flags: Placeholder for now, 0 until new features are supported. > + * @entries: Number of entries in the ring-buffer. > + * @overrun: Number of entries lost in the ring-buffer. > + * @read: Number of entries that have been read. > + * @Reserved1: Reserved for future use. > + * @Reserved2: Reserved for future use. > + */ > +struct trace_buffer_meta { > + __u32 meta_page_size; > + __u32 meta_struct_len; > + > + __u32 subbuf_size; > + __u32 nr_subbufs; > + > + struct { > + __u64 lost_events; > + __u32 id; > + __u32 read; > + } reader; > + > + __u64 flags; > + > + __u64 entries; > + __u64 overrun; > + __u64 read; > + > + __u64 Reserved1; > + __u64 Reserved2; > +}; > + > +#endif /* _TRACE_MMAP_H_ */ > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > index cc9ebe593571..793ecc454039 100644 > --- a/kernel/trace/ring_buffer.c > +++ b/kernel/trace/ring_buffer.c > @@ -9,6 +9,7 @@ > #include <linux/ring_buffer.h> > #include <linux/trace_clock.h> > #include <linux/sched/clock.h> > +#include <linux/cacheflush.h> > #include <linux/trace_seq.h> > #include <linux/spinlock.h> > #include <linux/irq_work.h> > @@ -26,6 +27,7 @@ > #include <linux/list.h> > #include <linux/cpu.h> > #include <linux/oom.h> > +#include <linux/mm.h> > > #include <asm/local64.h> > #include <asm/local.h> > @@ -338,6 +340,7 @@ struct buffer_page { > local_t entries; /* entries on this page */ > unsigned long real_end; /* real end of data */ > unsigned order; /* order of the page */ > + u32 id; /* ID for external mapping */ > struct buffer_data_page *page; /* Actual data page */ > }; > > @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { > u64 read_stamp; > /* pages removed since last reset */ > unsigned long pages_removed; > + > + unsigned int mapped; > + struct mutex mapping_lock; > + unsigned long *subbuf_ids; /* ID to subbuf VA */ > + struct trace_buffer_meta *meta_page; > + > /* ring buffer pages to update, > 0 to add, < 0 to remove */ > long nr_pages_to_update; > struct list_head new_pages; /* new pages to add */ > @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) > init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); > init_waitqueue_head(&cpu_buffer->irq_work.waiters); > init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); > + mutex_init(&cpu_buffer->mapping_lock); > > bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), > GFP_KERNEL, cpu_to_node(cpu)); > @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) > return buffer->time_stamp_abs; > } > > -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); > - > static inline unsigned long rb_page_entries(struct buffer_page *bpage) > { > return local_read(&bpage->entries) & RB_WRITE_MASK; > @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) > page->read = 0; > } > > +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > +{ > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > + > + meta->reader.read = cpu_buffer->reader_page->read; > + meta->reader.id = cpu_buffer->reader_page->id; > + meta->reader.lost_events = cpu_buffer->lost_events; > + > + meta->entries = local_read(&cpu_buffer->entries); > + meta->overrun = local_read(&cpu_buffer->overrun); > + meta->read = cpu_buffer->read; > + > + /* Some archs do not have data cache coherency between kernel and user-space */ > + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); > +} > + > static void > rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) > { > @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) > cpu_buffer->lost_events = 0; > cpu_buffer->last_overrun = 0; > > + if (cpu_buffer->mapped) > + rb_update_meta_page(cpu_buffer); > + > rb_head_page_activate(cpu_buffer); > cpu_buffer->pages_removed = 0; > } > @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, > cpu_buffer_a = buffer_a->buffers[cpu]; > cpu_buffer_b = buffer_b->buffers[cpu]; > > + /* It's up to the callers to not try to swap mapped buffers */ > + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { > + ret = -EBUSY; > + goto out; > + } > + > /* At least make sure the two buffers are somewhat the same */ > if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) > goto out; > @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, > * Otherwise, we can simply swap the page with the one passed in. > */ > if (read || (len < (commit - read)) || > - cpu_buffer->reader_page == cpu_buffer->commit_page) { > + cpu_buffer->reader_page == cpu_buffer->commit_page || > + cpu_buffer->mapped) { > struct buffer_data_page *rpage = cpu_buffer->reader_page->page; > unsigned int rpos = read; > unsigned int pos = 0; > @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) > > cpu_buffer = buffer->buffers[cpu]; > > + if (cpu_buffer->mapped) { > + err = -EBUSY; > + goto error; > + } > + > /* Update the number of pages to match the new size */ > nr_pages = old_size * buffer->buffers[cpu]->nr_pages; > nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); > @@ -6057,6 +6096,358 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) > } > EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); > > +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > +{ > + struct page *page; > + > + if (cpu_buffer->meta_page) > + return 0; > + > + page = alloc_page(GFP_USER | __GFP_ZERO); > + if (!page) > + return -ENOMEM; > + > + cpu_buffer->meta_page = page_to_virt(page); > + > + return 0; > +} > + > +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > +{ > + unsigned long addr = (unsigned long)cpu_buffer->meta_page; > + > + free_page(addr); > + cpu_buffer->meta_page = NULL; > +} > + > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, > + unsigned long *subbuf_ids) > +{ > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > + struct buffer_page *first_subbuf, *subbuf; > + int id = 0; > + > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > + cpu_buffer->reader_page->id = id++; > + > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > + do { > + if (WARN_ON(id >= nr_subbufs)) > + break; > + > + subbuf_ids[id] = (unsigned long)subbuf->page; > + subbuf->id = id; > + > + rb_inc_page(&subbuf); > + id++; > + } while (subbuf != first_subbuf); > + > + /* install subbuf ID to kern VA translation */ > + cpu_buffer->subbuf_ids = subbuf_ids; > + > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; > + meta->meta_struct_len = sizeof(*meta); > + meta->nr_subbufs = nr_subbufs; > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > + > + rb_update_meta_page(cpu_buffer); > +} > + > +static struct ring_buffer_per_cpu * > +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) > +{ > + struct ring_buffer_per_cpu *cpu_buffer; > + > + if (!cpumask_test_cpu(cpu, buffer->cpumask)) > + return ERR_PTR(-EINVAL); > + > + cpu_buffer = buffer->buffers[cpu]; > + > + mutex_lock(&cpu_buffer->mapping_lock); > + > + if (!cpu_buffer->mapped) { > + mutex_unlock(&cpu_buffer->mapping_lock); > + return ERR_PTR(-ENODEV); > + } > + > + return cpu_buffer; > +} > + > +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) > +{ > + mutex_unlock(&cpu_buffer->mapping_lock); > +} > + > +/* > + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need > + * to be set-up or torn-down. > + */ > +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, > + bool inc) > +{ > + unsigned long flags; > + > + lockdep_assert_held(&cpu_buffer->mapping_lock); > + > + if (inc && cpu_buffer->mapped == UINT_MAX) > + return -EBUSY; > + > + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) > + return -EINVAL; > + > + mutex_lock(&cpu_buffer->buffer->mutex); > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > + > + if (inc) > + cpu_buffer->mapped++; > + else > + cpu_buffer->mapped--; > + > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > + mutex_unlock(&cpu_buffer->buffer->mutex); > + > + return 0; > +} > + > +#define subbuf_page(off, start) \ > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > + > +#define foreach_subbuf_page(sub_order, start, page) \ > + page = subbuf_page(0, (start)); \ > + for (int __off = 0; __off < (1 << (sub_order)); \ > + __off++, page = subbuf_page(__off, (start))) > + > +/* > + * +--------------+ pgoff == 0 > + * | meta page | > + * +--------------+ pgoff == 1 > + * | 000000000 | > + * +--------------+ pgoff == (1 << subbuf_order) > + * | subbuffer 0 | > + * | | > + * +--------------+ pgoff == (2 * (1 << subbuf_order)) > + * | subbuffer 1 | > + * | | > + * ... > + */ > +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, > + struct vm_area_struct *vma) > +{ > + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; > + unsigned int subbuf_pages, subbuf_order; > + struct page **pages; > + int p = 0, s = 0; > + int err; > + > + lockdep_assert_held(&cpu_buffer->mapping_lock); > + > + subbuf_order = cpu_buffer->buffer->subbuf_order; > + subbuf_pages = 1 << subbuf_order; > + > + if (subbuf_order && pgoff % subbuf_pages) > + return -EINVAL; > + > + nr_subbufs = cpu_buffer->nr_pages + 1; > + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; > + > + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; > + if (!vma_pages || vma_pages > nr_pages) > + return -EINVAL; > + > + nr_pages = vma_pages; > + > + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); > + if (!pages) > + return -ENOMEM; > + > + if (!pgoff) { > + unsigned long meta_page_padding; > + > + pages[p++] = virt_to_page(cpu_buffer->meta_page); > + > + /* > + * Pad with the zero-page to align the meta-page with the > + * sub-buffers. > + */ > + meta_page_padding = subbuf_pages - 1; > + while (meta_page_padding-- && p < nr_pages) > + pages[p++] = ZERO_PAGE(0); Using the shared zeropage in a MAP_SHARED mapping that is neither VM_IO nor VM_PFNMAP can be problematic. So we really need patch #3 logic to use VM_PFNMAP. It would be cleaner/more obvious if these VMA flag setting would reside here, if possible. > + } else { > + /* Skip the meta-page */ > + pgoff -= subbuf_pages; > + > + s += pgoff / subbuf_pages; > + } > + > + while (s < nr_subbufs && p < nr_pages) { > + struct page *page; > + > + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { > + if (p >= nr_pages) > + break; > + > + pages[p++] = page; > + } > + s++; > + } > + > + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); I think Linus suggested it ("avoid all the sub-page ref-counts entirely by using VM_PFNMAP, and use vm_insert_pages()"), but ... vm_insert_pages() will: * Mess with mapcounts * Mess with refcounts See insert_pages()->insert_page_in_batch_locked()->insert_page_into_pte_locked(). So we'll mess with the mapcount and refcount of the shared zeropage ... hmmmm If I am not wrong, vm_normal_page() would not return the shared zeropage in case we don't have CONFIG_ARCH_HAS_PTE_SPECIAL ... so unmap()->...->zap_present_ptes() would not decrement the refcount and we could overflow it. ... we also shouldn't ever mess with the mapcount of the shared zeropage in the first place. vm_insert_page() is clearer on that: "This allows drivers to insert individual pages they've allocated into a user vma". You didn't allocate the shared zeropage. I'm wondering if we even expect VM_MIXEDMAP and VM_PFNMAP to be set at the same time? vm_insert_pages() would BUG_ON in case VM_PFNMAP is already set and it would set VM_MIXEDMAP ... similarly vmf_insert_pfn_prot() would not be happy about that at all ... BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); ... remap_pfn_range() is used by io_uring_mmap for a similar purpose. But it only supports a single PFN range at a time and requires the caller to handle refcounting of pages. It's getting late in Germany, so I might be missing something; but using the shared zeropage here might be a problem.
On 19.04.24 20:25, David Hildenbrand wrote: > On 06.04.24 19:36, Vincent Donnefort wrote: >> In preparation for allowing the user-space to map a ring-buffer, add >> a set of mapping functions: >> >> ring_buffer_{map,unmap}() >> >> And controls on the ring-buffer: >> >> ring_buffer_map_get_reader() /* swap reader and head */ >> >> Mapping the ring-buffer also involves: >> >> A unique ID for each subbuf of the ring-buffer, currently they are >> only identified through their in-kernel VA. >> >> A meta-page, where are stored ring-buffer statistics and a >> description for the current reader >> >> The linear mapping exposes the meta-page, and each subbuf of the >> ring-buffer, ordered following their unique ID, assigned during the >> first mapping. >> >> Once mapped, no subbuf can get in or out of the ring-buffer: the buffer >> size will remain unmodified and the splice enabling functions will in >> reality simply memcpy the data instead of swapping subbufs. >> >> CC: <linux-mm@kvack.org> >> Signed-off-by: Vincent Donnefort <vdonnefort@google.com> >> >> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h >> index dc5ae4e96aee..96d2140b471e 100644 >> --- a/include/linux/ring_buffer.h >> +++ b/include/linux/ring_buffer.h >> @@ -6,6 +6,8 @@ >> #include <linux/seq_file.h> >> #include <linux/poll.h> >> >> +#include <uapi/linux/trace_mmap.h> >> + >> struct trace_buffer; >> struct ring_buffer_iter; >> >> @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); >> #define trace_rb_cpu_prepare NULL >> #endif >> >> +int ring_buffer_map(struct trace_buffer *buffer, int cpu, >> + struct vm_area_struct *vma); >> +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); >> +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); >> #endif /* _LINUX_RING_BUFFER_H */ >> diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h >> new file mode 100644 >> index 000000000000..ffcd8dfcaa4f >> --- /dev/null >> +++ b/include/uapi/linux/trace_mmap.h >> @@ -0,0 +1,46 @@ >> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ >> +#ifndef _TRACE_MMAP_H_ >> +#define _TRACE_MMAP_H_ >> + >> +#include <linux/types.h> >> + >> +/** >> + * struct trace_buffer_meta - Ring-buffer Meta-page description >> + * @meta_page_size: Size of this meta-page. >> + * @meta_struct_len: Size of this structure. >> + * @subbuf_size: Size of each sub-buffer. >> + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. >> + * @reader.lost_events: Number of events lost at the time of the reader swap. >> + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] >> + * @reader.read: Number of bytes read on the reader subbuf. >> + * @flags: Placeholder for now, 0 until new features are supported. >> + * @entries: Number of entries in the ring-buffer. >> + * @overrun: Number of entries lost in the ring-buffer. >> + * @read: Number of entries that have been read. >> + * @Reserved1: Reserved for future use. >> + * @Reserved2: Reserved for future use. >> + */ >> +struct trace_buffer_meta { >> + __u32 meta_page_size; >> + __u32 meta_struct_len; >> + >> + __u32 subbuf_size; >> + __u32 nr_subbufs; >> + >> + struct { >> + __u64 lost_events; >> + __u32 id; >> + __u32 read; >> + } reader; >> + >> + __u64 flags; >> + >> + __u64 entries; >> + __u64 overrun; >> + __u64 read; >> + >> + __u64 Reserved1; >> + __u64 Reserved2; >> +}; >> + >> +#endif /* _TRACE_MMAP_H_ */ >> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c >> index cc9ebe593571..793ecc454039 100644 >> --- a/kernel/trace/ring_buffer.c >> +++ b/kernel/trace/ring_buffer.c >> @@ -9,6 +9,7 @@ >> #include <linux/ring_buffer.h> >> #include <linux/trace_clock.h> >> #include <linux/sched/clock.h> >> +#include <linux/cacheflush.h> >> #include <linux/trace_seq.h> >> #include <linux/spinlock.h> >> #include <linux/irq_work.h> >> @@ -26,6 +27,7 @@ >> #include <linux/list.h> >> #include <linux/cpu.h> >> #include <linux/oom.h> >> +#include <linux/mm.h> >> >> #include <asm/local64.h> >> #include <asm/local.h> >> @@ -338,6 +340,7 @@ struct buffer_page { >> local_t entries; /* entries on this page */ >> unsigned long real_end; /* real end of data */ >> unsigned order; /* order of the page */ >> + u32 id; /* ID for external mapping */ >> struct buffer_data_page *page; /* Actual data page */ >> }; >> >> @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { >> u64 read_stamp; >> /* pages removed since last reset */ >> unsigned long pages_removed; >> + >> + unsigned int mapped; >> + struct mutex mapping_lock; >> + unsigned long *subbuf_ids; /* ID to subbuf VA */ >> + struct trace_buffer_meta *meta_page; >> + >> /* ring buffer pages to update, > 0 to add, < 0 to remove */ >> long nr_pages_to_update; >> struct list_head new_pages; /* new pages to add */ >> @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) >> init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); >> init_waitqueue_head(&cpu_buffer->irq_work.waiters); >> init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); >> + mutex_init(&cpu_buffer->mapping_lock); >> >> bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), >> GFP_KERNEL, cpu_to_node(cpu)); >> @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) >> return buffer->time_stamp_abs; >> } >> >> -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); >> - >> static inline unsigned long rb_page_entries(struct buffer_page *bpage) >> { >> return local_read(&bpage->entries) & RB_WRITE_MASK; >> @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) >> page->read = 0; >> } >> >> +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >> +{ >> + struct trace_buffer_meta *meta = cpu_buffer->meta_page; >> + >> + meta->reader.read = cpu_buffer->reader_page->read; >> + meta->reader.id = cpu_buffer->reader_page->id; >> + meta->reader.lost_events = cpu_buffer->lost_events; >> + >> + meta->entries = local_read(&cpu_buffer->entries); >> + meta->overrun = local_read(&cpu_buffer->overrun); >> + meta->read = cpu_buffer->read; >> + >> + /* Some archs do not have data cache coherency between kernel and user-space */ >> + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); >> +} >> + >> static void >> rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) >> { >> @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) >> cpu_buffer->lost_events = 0; >> cpu_buffer->last_overrun = 0; >> >> + if (cpu_buffer->mapped) >> + rb_update_meta_page(cpu_buffer); >> + >> rb_head_page_activate(cpu_buffer); >> cpu_buffer->pages_removed = 0; >> } >> @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, >> cpu_buffer_a = buffer_a->buffers[cpu]; >> cpu_buffer_b = buffer_b->buffers[cpu]; >> >> + /* It's up to the callers to not try to swap mapped buffers */ >> + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { >> + ret = -EBUSY; >> + goto out; >> + } >> + >> /* At least make sure the two buffers are somewhat the same */ >> if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) >> goto out; >> @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, >> * Otherwise, we can simply swap the page with the one passed in. >> */ >> if (read || (len < (commit - read)) || >> - cpu_buffer->reader_page == cpu_buffer->commit_page) { >> + cpu_buffer->reader_page == cpu_buffer->commit_page || >> + cpu_buffer->mapped) { >> struct buffer_data_page *rpage = cpu_buffer->reader_page->page; >> unsigned int rpos = read; >> unsigned int pos = 0; >> @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) >> >> cpu_buffer = buffer->buffers[cpu]; >> >> + if (cpu_buffer->mapped) { >> + err = -EBUSY; >> + goto error; >> + } >> + >> /* Update the number of pages to match the new size */ >> nr_pages = old_size * buffer->buffers[cpu]->nr_pages; >> nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); >> @@ -6057,6 +6096,358 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) >> } >> EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); >> >> +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >> +{ >> + struct page *page; >> + >> + if (cpu_buffer->meta_page) >> + return 0; >> + >> + page = alloc_page(GFP_USER | __GFP_ZERO); >> + if (!page) >> + return -ENOMEM; >> + >> + cpu_buffer->meta_page = page_to_virt(page); >> + >> + return 0; >> +} >> + >> +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >> +{ >> + unsigned long addr = (unsigned long)cpu_buffer->meta_page; >> + >> + free_page(addr); >> + cpu_buffer->meta_page = NULL; >> +} >> + >> +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, >> + unsigned long *subbuf_ids) >> +{ >> + struct trace_buffer_meta *meta = cpu_buffer->meta_page; >> + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; >> + struct buffer_page *first_subbuf, *subbuf; >> + int id = 0; >> + >> + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; >> + cpu_buffer->reader_page->id = id++; >> + >> + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); >> + do { >> + if (WARN_ON(id >= nr_subbufs)) >> + break; >> + >> + subbuf_ids[id] = (unsigned long)subbuf->page; >> + subbuf->id = id; >> + >> + rb_inc_page(&subbuf); >> + id++; >> + } while (subbuf != first_subbuf); >> + >> + /* install subbuf ID to kern VA translation */ >> + cpu_buffer->subbuf_ids = subbuf_ids; >> + >> + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ >> + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; >> + meta->meta_struct_len = sizeof(*meta); >> + meta->nr_subbufs = nr_subbufs; >> + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; >> + >> + rb_update_meta_page(cpu_buffer); >> +} >> + >> +static struct ring_buffer_per_cpu * >> +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) >> +{ >> + struct ring_buffer_per_cpu *cpu_buffer; >> + >> + if (!cpumask_test_cpu(cpu, buffer->cpumask)) >> + return ERR_PTR(-EINVAL); >> + >> + cpu_buffer = buffer->buffers[cpu]; >> + >> + mutex_lock(&cpu_buffer->mapping_lock); >> + >> + if (!cpu_buffer->mapped) { >> + mutex_unlock(&cpu_buffer->mapping_lock); >> + return ERR_PTR(-ENODEV); >> + } >> + >> + return cpu_buffer; >> +} >> + >> +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) >> +{ >> + mutex_unlock(&cpu_buffer->mapping_lock); >> +} >> + >> +/* >> + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need >> + * to be set-up or torn-down. >> + */ >> +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, >> + bool inc) >> +{ >> + unsigned long flags; >> + >> + lockdep_assert_held(&cpu_buffer->mapping_lock); >> + >> + if (inc && cpu_buffer->mapped == UINT_MAX) >> + return -EBUSY; >> + >> + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) >> + return -EINVAL; >> + >> + mutex_lock(&cpu_buffer->buffer->mutex); >> + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); >> + >> + if (inc) >> + cpu_buffer->mapped++; >> + else >> + cpu_buffer->mapped--; >> + >> + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); >> + mutex_unlock(&cpu_buffer->buffer->mutex); >> + >> + return 0; >> +} >> + >> +#define subbuf_page(off, start) \ >> + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) >> + >> +#define foreach_subbuf_page(sub_order, start, page) \ >> + page = subbuf_page(0, (start)); \ >> + for (int __off = 0; __off < (1 << (sub_order)); \ >> + __off++, page = subbuf_page(__off, (start))) >> + >> +/* >> + * +--------------+ pgoff == 0 >> + * | meta page | >> + * +--------------+ pgoff == 1 >> + * | 000000000 | >> + * +--------------+ pgoff == (1 << subbuf_order) >> + * | subbuffer 0 | >> + * | | >> + * +--------------+ pgoff == (2 * (1 << subbuf_order)) >> + * | subbuffer 1 | >> + * | | >> + * ... >> + */ >> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, >> + struct vm_area_struct *vma) >> +{ >> + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; >> + unsigned int subbuf_pages, subbuf_order; >> + struct page **pages; >> + int p = 0, s = 0; >> + int err; >> + >> + lockdep_assert_held(&cpu_buffer->mapping_lock); >> + >> + subbuf_order = cpu_buffer->buffer->subbuf_order; >> + subbuf_pages = 1 << subbuf_order; >> + >> + if (subbuf_order && pgoff % subbuf_pages) >> + return -EINVAL; >> + >> + nr_subbufs = cpu_buffer->nr_pages + 1; >> + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; >> + >> + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; >> + if (!vma_pages || vma_pages > nr_pages) >> + return -EINVAL; >> + >> + nr_pages = vma_pages; >> + >> + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); >> + if (!pages) >> + return -ENOMEM; >> + >> + if (!pgoff) { >> + unsigned long meta_page_padding; >> + >> + pages[p++] = virt_to_page(cpu_buffer->meta_page); >> + >> + /* >> + * Pad with the zero-page to align the meta-page with the >> + * sub-buffers. >> + */ >> + meta_page_padding = subbuf_pages - 1; >> + while (meta_page_padding-- && p < nr_pages) >> + pages[p++] = ZERO_PAGE(0); > > Using the shared zeropage in a MAP_SHARED mapping that is neither VM_IO > nor VM_PFNMAP can be problematic. So we really need patch #3 logic to > use VM_PFNMAP. > > It would be cleaner/more obvious if these VMA flag setting would reside > here, if possible. > >> + } else { >> + /* Skip the meta-page */ >> + pgoff -= subbuf_pages; >> + >> + s += pgoff / subbuf_pages; >> + } >> + >> + while (s < nr_subbufs && p < nr_pages) { >> + struct page *page; >> + >> + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { >> + if (p >= nr_pages) >> + break; >> + >> + pages[p++] = page; >> + } >> + s++; >> + } >> + >> + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); > > I think Linus suggested it ("avoid all the sub-page ref-counts entirely > by using VM_PFNMAP, and use vm_insert_pages()"), but ... > vm_insert_pages() will: > * Mess with mapcounts > * Mess with refcounts > > See > insert_pages()->insert_page_in_batch_locked()->insert_page_into_pte_locked(). > > So we'll mess with the mapcount and refcount of the shared zeropage ... > hmmmm > > If I am not wrong, vm_normal_page() would not return the shared zeropage > in case we don't have CONFIG_ARCH_HAS_PTE_SPECIAL ... so > unmap()->...->zap_present_ptes() would not decrement the refcount and we > could overflow it. ... we also shouldn't ever mess with the mapcount of > the shared zeropage in the first place. > > vm_insert_page() is clearer on that: "This allows drivers to insert > individual pages they've allocated into a user vma". You didn't allocate > the shared zeropage. > > I'm wondering if we even expect VM_MIXEDMAP and VM_PFNMAP to be set at > the same time? vm_insert_pages() would BUG_ON in case VM_PFNMAP is > already set and it would set VM_MIXEDMAP ... similarly > vmf_insert_pfn_prot() would not be happy about that at all ... > > BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == > (VM_PFNMAP|VM_MIXEDMAP)); > > ... remap_pfn_range() is used by io_uring_mmap for a similar purpose. > But it only supports a single PFN range at a time and requires the > caller to handle refcounting of pages. > > It's getting late in Germany, so I might be missing something; but using > the shared zeropage here might be a problem. > I was just about to write code to cleanly support vm_insert_pages() to consume zeropages ... when I started to wonder about something else: + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || + !(vma->vm_flags & VM_MAYSHARE)) + return -EPERM; + You disallow writable mappings. But what about using mprotect() afterwards to allow for write permissions? Likely you'd want to remove VM_MAYWRITE from the flags, otherwise mprotect() might alter succeed.
On Thu, Apr 18, 2024 at 11:43:46PM -0400, Steven Rostedt wrote: > On Thu, 18 Apr 2024 09:55:55 +0300 > Mike Rapoport <rppt@kernel.org> wrote: > > Hi Mike, > > Thanks for doing this review! > > > > +/** > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > + * @meta_page_size: Size of this meta-page. > > > + * @meta_struct_len: Size of this structure. > > > + * @subbuf_size: Size of each sub-buffer. > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. > > > + * @reader.lost_events: Number of events lost at the time of the reader swap. > > > + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > + * @flags: Placeholder for now, 0 until new features are supported. > > > + * @entries: Number of entries in the ring-buffer. > > > + * @overrun: Number of entries lost in the ring-buffer. > > > + * @read: Number of entries that have been read. > > > + * @Reserved1: Reserved for future use. > > > + * @Reserved2: Reserved for future use. > > > + */ > > > +struct trace_buffer_meta { > > > + __u32 meta_page_size; > > > + __u32 meta_struct_len; > > > + > > > + __u32 subbuf_size; > > > + __u32 nr_subbufs; > > > + > > > + struct { > > > + __u64 lost_events; > > > + __u32 id; > > > + __u32 read; > > > + } reader; > > > + > > > + __u64 flags; > > > + > > > + __u64 entries; > > > + __u64 overrun; > > > + __u64 read; > > > + > > > + __u64 Reserved1; > > > + __u64 Reserved2; > > > > Why do you need reserved fields? This structure always resides in the > > beginning of a page and the rest of the page is essentially "reserved". > > So this code is also going to be used in arm's pkvm hypervisor code, > where it will be using these fields, but since we are looking at > keeping the same interface between the two, we don't want these used by > this interface. > > We probably should add a comment about that. > > > > > > +}; > > > + > > > +#endif /* _TRACE_MMAP_H_ */ > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > index cc9ebe593571..793ecc454039 100644 > > > --- a/kernel/trace/ring_buffer.c > > > +++ b/kernel/trace/ring_buffer.c > > > > ... > > > > > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, > > > + unsigned long *subbuf_ids) > > > +{ > > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > > > + struct buffer_page *first_subbuf, *subbuf; > > > + int id = 0; > > > + > > > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > > > + cpu_buffer->reader_page->id = id++; > > > + > > > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > > > + do { > > > + if (WARN_ON(id >= nr_subbufs)) > > > + break; > > > + > > > + subbuf_ids[id] = (unsigned long)subbuf->page; > > > + subbuf->id = id; > > > + > > > + rb_inc_page(&subbuf); > > > + id++; > > > + } while (subbuf != first_subbuf); > > > + > > > + /* install subbuf ID to kern VA translation */ > > > + cpu_buffer->subbuf_ids = subbuf_ids; > > > + > > > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > > > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; > > > > Isn't this a single page? > > One thing we are doing is to make sure that the subbuffers are aligned > by their size. If a subbuffer is 3 pages, it should be aligned on 3 > page boundaries. This was something that Linus suggested. > > > > > > + meta->meta_struct_len = sizeof(*meta); > > > + meta->nr_subbufs = nr_subbufs; > > > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > > > + > > > + rb_update_meta_page(cpu_buffer); > > > +} > > > > ... > > > > > +#define subbuf_page(off, start) \ > > > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > > > + > > > +#define foreach_subbuf_page(sub_order, start, page) \ > > > > Nit: usually iterators in kernel use for_each > > Ah, good catch. Yeah, that should be changed. But then ... > > > > > > + page = subbuf_page(0, (start)); \ > > > + for (int __off = 0; __off < (1 << (sub_order)); \ > > > + __off++, page = subbuf_page(__off, (start))) > > > > The pages are allocated with alloc_pages_node(.. subbuf_order) are > > physically contiguous and struct pages for them are also contiguous, so > > inside a subbuf_order allocation you can just do page++. > > > > I'm wondering if we should just nuke the macro. It was there because of > the previous implementation did things twice. But now it's just done > once here: > > + while (s < nr_subbufs && p < nr_pages) { > + struct page *page; > + > + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { > + if (p >= nr_pages) > + break; > + > + pages[p++] = page; > + } > + s++; > + } > > Perhaps that should just be: > > while (s < nr_subbufs && p < nr_pages) { > struct page *page; > int off; > > page = subbuf_page(0, cpu_buffer->subbuf_ids[s]); > for (off = 0; off < (1 << subbuf_order); off++, page++, p++) { > if (p >= nr_pages) > break; > > pages[p] = page; > } > s++; > } > > ? Yeah, was hesitating to kill it with the last version. Happy to do it for the next one. > > -- Steve
Hi David, Thanks for having a look, very much appreciated! On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: > On 19.04.24 20:25, David Hildenbrand wrote: > > On 06.04.24 19:36, Vincent Donnefort wrote: > > > In preparation for allowing the user-space to map a ring-buffer, add > > > a set of mapping functions: > > > > > > ring_buffer_{map,unmap}() > > > > > > And controls on the ring-buffer: > > > > > > ring_buffer_map_get_reader() /* swap reader and head */ > > > > > > Mapping the ring-buffer also involves: > > > > > > A unique ID for each subbuf of the ring-buffer, currently they are > > > only identified through their in-kernel VA. > > > > > > A meta-page, where are stored ring-buffer statistics and a > > > description for the current reader > > > > > > The linear mapping exposes the meta-page, and each subbuf of the > > > ring-buffer, ordered following their unique ID, assigned during the > > > first mapping. > > > > > > Once mapped, no subbuf can get in or out of the ring-buffer: the buffer > > > size will remain unmodified and the splice enabling functions will in > > > reality simply memcpy the data instead of swapping subbufs. > > > > > > CC: <linux-mm@kvack.org> > > > Signed-off-by: Vincent Donnefort <vdonnefort@google.com> > > > > > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > > > index dc5ae4e96aee..96d2140b471e 100644 > > > --- a/include/linux/ring_buffer.h > > > +++ b/include/linux/ring_buffer.h > > > @@ -6,6 +6,8 @@ > > > #include <linux/seq_file.h> > > > #include <linux/poll.h> > > > +#include <uapi/linux/trace_mmap.h> > > > + > > > struct trace_buffer; > > > struct ring_buffer_iter; > > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); > > > #define trace_rb_cpu_prepare NULL > > > #endif > > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > > > + struct vm_area_struct *vma); > > > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > > > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > > > #endif /* _LINUX_RING_BUFFER_H */ > > > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h > > > new file mode 100644 > > > index 000000000000..ffcd8dfcaa4f > > > --- /dev/null > > > +++ b/include/uapi/linux/trace_mmap.h > > > @@ -0,0 +1,46 @@ > > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > > > +#ifndef _TRACE_MMAP_H_ > > > +#define _TRACE_MMAP_H_ > > > + > > > +#include <linux/types.h> > > > + > > > +/** > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > + * @meta_page_size: Size of this meta-page. > > > + * @meta_struct_len: Size of this structure. > > > + * @subbuf_size: Size of each sub-buffer. > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. > > > + * @reader.lost_events: Number of events lost at the time of the reader swap. > > > + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > + * @flags: Placeholder for now, 0 until new features are supported. > > > + * @entries: Number of entries in the ring-buffer. > > > + * @overrun: Number of entries lost in the ring-buffer. > > > + * @read: Number of entries that have been read. > > > + * @Reserved1: Reserved for future use. > > > + * @Reserved2: Reserved for future use. > > > + */ > > > +struct trace_buffer_meta { > > > + __u32 meta_page_size; > > > + __u32 meta_struct_len; > > > + > > > + __u32 subbuf_size; > > > + __u32 nr_subbufs; > > > + > > > + struct { > > > + __u64 lost_events; > > > + __u32 id; > > > + __u32 read; > > > + } reader; > > > + > > > + __u64 flags; > > > + > > > + __u64 entries; > > > + __u64 overrun; > > > + __u64 read; > > > + > > > + __u64 Reserved1; > > > + __u64 Reserved2; > > > +}; > > > + > > > +#endif /* _TRACE_MMAP_H_ */ > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > index cc9ebe593571..793ecc454039 100644 > > > --- a/kernel/trace/ring_buffer.c > > > +++ b/kernel/trace/ring_buffer.c > > > @@ -9,6 +9,7 @@ > > > #include <linux/ring_buffer.h> > > > #include <linux/trace_clock.h> > > > #include <linux/sched/clock.h> > > > +#include <linux/cacheflush.h> > > > #include <linux/trace_seq.h> > > > #include <linux/spinlock.h> > > > #include <linux/irq_work.h> > > > @@ -26,6 +27,7 @@ > > > #include <linux/list.h> > > > #include <linux/cpu.h> > > > #include <linux/oom.h> > > > +#include <linux/mm.h> > > > #include <asm/local64.h> > > > #include <asm/local.h> > > > @@ -338,6 +340,7 @@ struct buffer_page { > > > local_t entries; /* entries on this page */ > > > unsigned long real_end; /* real end of data */ > > > unsigned order; /* order of the page */ > > > + u32 id; /* ID for external mapping */ > > > struct buffer_data_page *page; /* Actual data page */ > > > }; > > > @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { > > > u64 read_stamp; > > > /* pages removed since last reset */ > > > unsigned long pages_removed; > > > + > > > + unsigned int mapped; > > > + struct mutex mapping_lock; > > > + unsigned long *subbuf_ids; /* ID to subbuf VA */ > > > + struct trace_buffer_meta *meta_page; > > > + > > > /* ring buffer pages to update, > 0 to add, < 0 to remove */ > > > long nr_pages_to_update; > > > struct list_head new_pages; /* new pages to add */ > > > @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) > > > init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); > > > init_waitqueue_head(&cpu_buffer->irq_work.waiters); > > > init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); > > > + mutex_init(&cpu_buffer->mapping_lock); > > > bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), > > > GFP_KERNEL, cpu_to_node(cpu)); > > > @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) > > > return buffer->time_stamp_abs; > > > } > > > -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); > > > - > > > static inline unsigned long rb_page_entries(struct buffer_page *bpage) > > > { > > > return local_read(&bpage->entries) & RB_WRITE_MASK; > > > @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) > > > page->read = 0; > > > } > > > +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > > > +{ > > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > > + > > > + meta->reader.read = cpu_buffer->reader_page->read; > > > + meta->reader.id = cpu_buffer->reader_page->id; > > > + meta->reader.lost_events = cpu_buffer->lost_events; > > > + > > > + meta->entries = local_read(&cpu_buffer->entries); > > > + meta->overrun = local_read(&cpu_buffer->overrun); > > > + meta->read = cpu_buffer->read; > > > + > > > + /* Some archs do not have data cache coherency between kernel and user-space */ > > > + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); > > > +} > > > + > > > static void > > > rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) > > > { > > > @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) > > > cpu_buffer->lost_events = 0; > > > cpu_buffer->last_overrun = 0; > > > + if (cpu_buffer->mapped) > > > + rb_update_meta_page(cpu_buffer); > > > + > > > rb_head_page_activate(cpu_buffer); > > > cpu_buffer->pages_removed = 0; > > > } > > > @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, > > > cpu_buffer_a = buffer_a->buffers[cpu]; > > > cpu_buffer_b = buffer_b->buffers[cpu]; > > > + /* It's up to the callers to not try to swap mapped buffers */ > > > + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { > > > + ret = -EBUSY; > > > + goto out; > > > + } > > > + > > > /* At least make sure the two buffers are somewhat the same */ > > > if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) > > > goto out; > > > @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, > > > * Otherwise, we can simply swap the page with the one passed in. > > > */ > > > if (read || (len < (commit - read)) || > > > - cpu_buffer->reader_page == cpu_buffer->commit_page) { > > > + cpu_buffer->reader_page == cpu_buffer->commit_page || > > > + cpu_buffer->mapped) { > > > struct buffer_data_page *rpage = cpu_buffer->reader_page->page; > > > unsigned int rpos = read; > > > unsigned int pos = 0; > > > @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) > > > cpu_buffer = buffer->buffers[cpu]; > > > + if (cpu_buffer->mapped) { > > > + err = -EBUSY; > > > + goto error; > > > + } > > > + > > > /* Update the number of pages to match the new size */ > > > nr_pages = old_size * buffer->buffers[cpu]->nr_pages; > > > nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); > > > @@ -6057,6 +6096,358 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) > > > } > > > EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); > > > +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > > > +{ > > > + struct page *page; > > > + > > > + if (cpu_buffer->meta_page) > > > + return 0; > > > + > > > + page = alloc_page(GFP_USER | __GFP_ZERO); > > > + if (!page) > > > + return -ENOMEM; > > > + > > > + cpu_buffer->meta_page = page_to_virt(page); > > > + > > > + return 0; > > > +} > > > + > > > +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > > > +{ > > > + unsigned long addr = (unsigned long)cpu_buffer->meta_page; > > > + > > > + free_page(addr); > > > + cpu_buffer->meta_page = NULL; > > > +} > > > + > > > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, > > > + unsigned long *subbuf_ids) > > > +{ > > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > > > + struct buffer_page *first_subbuf, *subbuf; > > > + int id = 0; > > > + > > > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > > > + cpu_buffer->reader_page->id = id++; > > > + > > > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > > > + do { > > > + if (WARN_ON(id >= nr_subbufs)) > > > + break; > > > + > > > + subbuf_ids[id] = (unsigned long)subbuf->page; > > > + subbuf->id = id; > > > + > > > + rb_inc_page(&subbuf); > > > + id++; > > > + } while (subbuf != first_subbuf); > > > + > > > + /* install subbuf ID to kern VA translation */ > > > + cpu_buffer->subbuf_ids = subbuf_ids; > > > + > > > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > > > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; > > > + meta->meta_struct_len = sizeof(*meta); > > > + meta->nr_subbufs = nr_subbufs; > > > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > > > + > > > + rb_update_meta_page(cpu_buffer); > > > +} > > > + > > > +static struct ring_buffer_per_cpu * > > > +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) > > > +{ > > > + struct ring_buffer_per_cpu *cpu_buffer; > > > + > > > + if (!cpumask_test_cpu(cpu, buffer->cpumask)) > > > + return ERR_PTR(-EINVAL); > > > + > > > + cpu_buffer = buffer->buffers[cpu]; > > > + > > > + mutex_lock(&cpu_buffer->mapping_lock); > > > + > > > + if (!cpu_buffer->mapped) { > > > + mutex_unlock(&cpu_buffer->mapping_lock); > > > + return ERR_PTR(-ENODEV); > > > + } > > > + > > > + return cpu_buffer; > > > +} > > > + > > > +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) > > > +{ > > > + mutex_unlock(&cpu_buffer->mapping_lock); > > > +} > > > + > > > +/* > > > + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need > > > + * to be set-up or torn-down. > > > + */ > > > +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, > > > + bool inc) > > > +{ > > > + unsigned long flags; > > > + > > > + lockdep_assert_held(&cpu_buffer->mapping_lock); > > > + > > > + if (inc && cpu_buffer->mapped == UINT_MAX) > > > + return -EBUSY; > > > + > > > + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) > > > + return -EINVAL; > > > + > > > + mutex_lock(&cpu_buffer->buffer->mutex); > > > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > > > + > > > + if (inc) > > > + cpu_buffer->mapped++; > > > + else > > > + cpu_buffer->mapped--; > > > + > > > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > > > + mutex_unlock(&cpu_buffer->buffer->mutex); > > > + > > > + return 0; > > > +} > > > + > > > +#define subbuf_page(off, start) \ > > > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > > > + > > > +#define foreach_subbuf_page(sub_order, start, page) \ > > > + page = subbuf_page(0, (start)); \ > > > + for (int __off = 0; __off < (1 << (sub_order)); \ > > > + __off++, page = subbuf_page(__off, (start))) > > > + > > > +/* > > > + * +--------------+ pgoff == 0 > > > + * | meta page | > > > + * +--------------+ pgoff == 1 > > > + * | 000000000 | > > > + * +--------------+ pgoff == (1 << subbuf_order) > > > + * | subbuffer 0 | > > > + * | | > > > + * +--------------+ pgoff == (2 * (1 << subbuf_order)) > > > + * | subbuffer 1 | > > > + * | | > > > + * ... > > > + */ > > > +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, > > > + struct vm_area_struct *vma) > > > +{ > > > + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; > > > + unsigned int subbuf_pages, subbuf_order; > > > + struct page **pages; > > > + int p = 0, s = 0; > > > + int err; > > > + > > > + lockdep_assert_held(&cpu_buffer->mapping_lock); > > > + > > > + subbuf_order = cpu_buffer->buffer->subbuf_order; > > > + subbuf_pages = 1 << subbuf_order; > > > + > > > + if (subbuf_order && pgoff % subbuf_pages) > > > + return -EINVAL; > > > + > > > + nr_subbufs = cpu_buffer->nr_pages + 1; > > > + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; > > > + > > > + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; > > > + if (!vma_pages || vma_pages > nr_pages) > > > + return -EINVAL; > > > + > > > + nr_pages = vma_pages; > > > + > > > + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); > > > + if (!pages) > > > + return -ENOMEM; > > > + > > > + if (!pgoff) { > > > + unsigned long meta_page_padding; > > > + > > > + pages[p++] = virt_to_page(cpu_buffer->meta_page); > > > + > > > + /* > > > + * Pad with the zero-page to align the meta-page with the > > > + * sub-buffers. > > > + */ > > > + meta_page_padding = subbuf_pages - 1; > > > + while (meta_page_padding-- && p < nr_pages) > > > + pages[p++] = ZERO_PAGE(0); > > > > Using the shared zeropage in a MAP_SHARED mapping that is neither VM_IO > > nor VM_PFNMAP can be problematic. So we really need patch #3 logic to > > use VM_PFNMAP. > > > > It would be cleaner/more obvious if these VMA flag setting would reside tracing_buffers_mmap() sets both VM_IO and VM_PFNMAP. But, yeah as vma is already passed to ring_buffer_map(). The flags could be set here as this function is what sets the requirements really. > > here, if possible. > > > > > + } else { > > > + /* Skip the meta-page */ > > > + pgoff -= subbuf_pages; > > > + > > > + s += pgoff / subbuf_pages; > > > + } > > > + > > > + while (s < nr_subbufs && p < nr_pages) { > > > + struct page *page; > > > + > > > + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { > > > + if (p >= nr_pages) > > > + break; > > > + > > > + pages[p++] = page; > > > + } > > > + s++; > > > + } > > > + > > > + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); > > > > I think Linus suggested it ("avoid all the sub-page ref-counts entirely > > by using VM_PFNMAP, and use vm_insert_pages()"), but ... > > vm_insert_pages() will: > > * Mess with mapcounts > > * Mess with refcounts > > > > See > > insert_pages()->insert_page_in_batch_locked()->insert_page_into_pte_locked(). > > > > So we'll mess with the mapcount and refcount of the shared zeropage ... > > hmmmm > > > > If I am not wrong, vm_normal_page() would not return the shared zeropage > > in case we don't have CONFIG_ARCH_HAS_PTE_SPECIAL ... so > > unmap()->...->zap_present_ptes() would not decrement the refcount and we > > could overflow it. ... we also shouldn't ever mess with the mapcount of > > the shared zeropage in the first place. > > > > vm_insert_page() is clearer on that: "This allows drivers to insert > > individual pages they've allocated into a user vma". You didn't allocate > > the shared zeropage. > > > > I'm wondering if we even expect VM_MIXEDMAP and VM_PFNMAP to be set at > > the same time? vm_insert_pages() would BUG_ON in case VM_PFNMAP is > > already set and it would set VM_MIXEDMAP ... similarly > > vmf_insert_pfn_prot() would not be happy about that at all ... > > > > BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == > > (VM_PFNMAP|VM_MIXEDMAP)); > > > > ... remap_pfn_range() is used by io_uring_mmap for a similar purpose. > > But it only supports a single PFN range at a time and requires the > > caller to handle refcounting of pages. > > > > It's getting late in Germany, so I might be missing something; but using > > the shared zeropage here might be a problem. > > > > I was just about to write code to cleanly support vm_insert_pages() to > consume zeropages ... when I started to wonder about something else: Alternatively, we could at the moment allocate a meta-page of the same size than the subbufs? (of course the downside is to waste a bit of memory) > > > + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || > + !(vma->vm_flags & VM_MAYSHARE)) > + return -EPERM; > + > > You disallow writable mappings. But what about using mprotect() afterwards > to allow for write permissions? > > Likely you'd want to remove VM_MAYWRITE from the flags, otherwise mprotect() > might alter succeed. The vm_flags_mod below is clearing VM_MAYWRITE already. Isn't it enough? > > -- > Cheers, > > David / dhildenb >
On 22.04.24 20:20, Vincent Donnefort wrote: > Hi David, > > Thanks for having a look, very much appreciated! > > On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: >> On 19.04.24 20:25, David Hildenbrand wrote: >>> On 06.04.24 19:36, Vincent Donnefort wrote: >>>> In preparation for allowing the user-space to map a ring-buffer, add >>>> a set of mapping functions: >>>> >>>> ring_buffer_{map,unmap}() >>>> >>>> And controls on the ring-buffer: >>>> >>>> ring_buffer_map_get_reader() /* swap reader and head */ >>>> >>>> Mapping the ring-buffer also involves: >>>> >>>> A unique ID for each subbuf of the ring-buffer, currently they are >>>> only identified through their in-kernel VA. >>>> >>>> A meta-page, where are stored ring-buffer statistics and a >>>> description for the current reader >>>> >>>> The linear mapping exposes the meta-page, and each subbuf of the >>>> ring-buffer, ordered following their unique ID, assigned during the >>>> first mapping. >>>> >>>> Once mapped, no subbuf can get in or out of the ring-buffer: the buffer >>>> size will remain unmodified and the splice enabling functions will in >>>> reality simply memcpy the data instead of swapping subbufs. >>>> >>>> CC: <linux-mm@kvack.org> >>>> Signed-off-by: Vincent Donnefort <vdonnefort@google.com> >>>> >>>> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h >>>> index dc5ae4e96aee..96d2140b471e 100644 >>>> --- a/include/linux/ring_buffer.h >>>> +++ b/include/linux/ring_buffer.h >>>> @@ -6,6 +6,8 @@ >>>> #include <linux/seq_file.h> >>>> #include <linux/poll.h> >>>> +#include <uapi/linux/trace_mmap.h> >>>> + >>>> struct trace_buffer; >>>> struct ring_buffer_iter; >>>> @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); >>>> #define trace_rb_cpu_prepare NULL >>>> #endif >>>> +int ring_buffer_map(struct trace_buffer *buffer, int cpu, >>>> + struct vm_area_struct *vma); >>>> +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); >>>> +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); >>>> #endif /* _LINUX_RING_BUFFER_H */ >>>> diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h >>>> new file mode 100644 >>>> index 000000000000..ffcd8dfcaa4f >>>> --- /dev/null >>>> +++ b/include/uapi/linux/trace_mmap.h >>>> @@ -0,0 +1,46 @@ >>>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ >>>> +#ifndef _TRACE_MMAP_H_ >>>> +#define _TRACE_MMAP_H_ >>>> + >>>> +#include <linux/types.h> >>>> + >>>> +/** >>>> + * struct trace_buffer_meta - Ring-buffer Meta-page description >>>> + * @meta_page_size: Size of this meta-page. >>>> + * @meta_struct_len: Size of this structure. >>>> + * @subbuf_size: Size of each sub-buffer. >>>> + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. >>>> + * @reader.lost_events: Number of events lost at the time of the reader swap. >>>> + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] >>>> + * @reader.read: Number of bytes read on the reader subbuf. >>>> + * @flags: Placeholder for now, 0 until new features are supported. >>>> + * @entries: Number of entries in the ring-buffer. >>>> + * @overrun: Number of entries lost in the ring-buffer. >>>> + * @read: Number of entries that have been read. >>>> + * @Reserved1: Reserved for future use. >>>> + * @Reserved2: Reserved for future use. >>>> + */ >>>> +struct trace_buffer_meta { >>>> + __u32 meta_page_size; >>>> + __u32 meta_struct_len; >>>> + >>>> + __u32 subbuf_size; >>>> + __u32 nr_subbufs; >>>> + >>>> + struct { >>>> + __u64 lost_events; >>>> + __u32 id; >>>> + __u32 read; >>>> + } reader; >>>> + >>>> + __u64 flags; >>>> + >>>> + __u64 entries; >>>> + __u64 overrun; >>>> + __u64 read; >>>> + >>>> + __u64 Reserved1; >>>> + __u64 Reserved2; >>>> +}; >>>> + >>>> +#endif /* _TRACE_MMAP_H_ */ >>>> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c >>>> index cc9ebe593571..793ecc454039 100644 >>>> --- a/kernel/trace/ring_buffer.c >>>> +++ b/kernel/trace/ring_buffer.c >>>> @@ -9,6 +9,7 @@ >>>> #include <linux/ring_buffer.h> >>>> #include <linux/trace_clock.h> >>>> #include <linux/sched/clock.h> >>>> +#include <linux/cacheflush.h> >>>> #include <linux/trace_seq.h> >>>> #include <linux/spinlock.h> >>>> #include <linux/irq_work.h> >>>> @@ -26,6 +27,7 @@ >>>> #include <linux/list.h> >>>> #include <linux/cpu.h> >>>> #include <linux/oom.h> >>>> +#include <linux/mm.h> >>>> #include <asm/local64.h> >>>> #include <asm/local.h> >>>> @@ -338,6 +340,7 @@ struct buffer_page { >>>> local_t entries; /* entries on this page */ >>>> unsigned long real_end; /* real end of data */ >>>> unsigned order; /* order of the page */ >>>> + u32 id; /* ID for external mapping */ >>>> struct buffer_data_page *page; /* Actual data page */ >>>> }; >>>> @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { >>>> u64 read_stamp; >>>> /* pages removed since last reset */ >>>> unsigned long pages_removed; >>>> + >>>> + unsigned int mapped; >>>> + struct mutex mapping_lock; >>>> + unsigned long *subbuf_ids; /* ID to subbuf VA */ >>>> + struct trace_buffer_meta *meta_page; >>>> + >>>> /* ring buffer pages to update, > 0 to add, < 0 to remove */ >>>> long nr_pages_to_update; >>>> struct list_head new_pages; /* new pages to add */ >>>> @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) >>>> init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); >>>> init_waitqueue_head(&cpu_buffer->irq_work.waiters); >>>> init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); >>>> + mutex_init(&cpu_buffer->mapping_lock); >>>> bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), >>>> GFP_KERNEL, cpu_to_node(cpu)); >>>> @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) >>>> return buffer->time_stamp_abs; >>>> } >>>> -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); >>>> - >>>> static inline unsigned long rb_page_entries(struct buffer_page *bpage) >>>> { >>>> return local_read(&bpage->entries) & RB_WRITE_MASK; >>>> @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) >>>> page->read = 0; >>>> } >>>> +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >>>> +{ >>>> + struct trace_buffer_meta *meta = cpu_buffer->meta_page; >>>> + >>>> + meta->reader.read = cpu_buffer->reader_page->read; >>>> + meta->reader.id = cpu_buffer->reader_page->id; >>>> + meta->reader.lost_events = cpu_buffer->lost_events; >>>> + >>>> + meta->entries = local_read(&cpu_buffer->entries); >>>> + meta->overrun = local_read(&cpu_buffer->overrun); >>>> + meta->read = cpu_buffer->read; >>>> + >>>> + /* Some archs do not have data cache coherency between kernel and user-space */ >>>> + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); >>>> +} >>>> + >>>> static void >>>> rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) >>>> { >>>> @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) >>>> cpu_buffer->lost_events = 0; >>>> cpu_buffer->last_overrun = 0; >>>> + if (cpu_buffer->mapped) >>>> + rb_update_meta_page(cpu_buffer); >>>> + >>>> rb_head_page_activate(cpu_buffer); >>>> cpu_buffer->pages_removed = 0; >>>> } >>>> @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, >>>> cpu_buffer_a = buffer_a->buffers[cpu]; >>>> cpu_buffer_b = buffer_b->buffers[cpu]; >>>> + /* It's up to the callers to not try to swap mapped buffers */ >>>> + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { >>>> + ret = -EBUSY; >>>> + goto out; >>>> + } >>>> + >>>> /* At least make sure the two buffers are somewhat the same */ >>>> if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) >>>> goto out; >>>> @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, >>>> * Otherwise, we can simply swap the page with the one passed in. >>>> */ >>>> if (read || (len < (commit - read)) || >>>> - cpu_buffer->reader_page == cpu_buffer->commit_page) { >>>> + cpu_buffer->reader_page == cpu_buffer->commit_page || >>>> + cpu_buffer->mapped) { >>>> struct buffer_data_page *rpage = cpu_buffer->reader_page->page; >>>> unsigned int rpos = read; >>>> unsigned int pos = 0; >>>> @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) >>>> cpu_buffer = buffer->buffers[cpu]; >>>> + if (cpu_buffer->mapped) { >>>> + err = -EBUSY; >>>> + goto error; >>>> + } >>>> + >>>> /* Update the number of pages to match the new size */ >>>> nr_pages = old_size * buffer->buffers[cpu]->nr_pages; >>>> nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); >>>> @@ -6057,6 +6096,358 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) >>>> } >>>> EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); >>>> +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >>>> +{ >>>> + struct page *page; >>>> + >>>> + if (cpu_buffer->meta_page) >>>> + return 0; >>>> + >>>> + page = alloc_page(GFP_USER | __GFP_ZERO); >>>> + if (!page) >>>> + return -ENOMEM; >>>> + >>>> + cpu_buffer->meta_page = page_to_virt(page); >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >>>> +{ >>>> + unsigned long addr = (unsigned long)cpu_buffer->meta_page; >>>> + >>>> + free_page(addr); >>>> + cpu_buffer->meta_page = NULL; >>>> +} >>>> + >>>> +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, >>>> + unsigned long *subbuf_ids) >>>> +{ >>>> + struct trace_buffer_meta *meta = cpu_buffer->meta_page; >>>> + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; >>>> + struct buffer_page *first_subbuf, *subbuf; >>>> + int id = 0; >>>> + >>>> + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; >>>> + cpu_buffer->reader_page->id = id++; >>>> + >>>> + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); >>>> + do { >>>> + if (WARN_ON(id >= nr_subbufs)) >>>> + break; >>>> + >>>> + subbuf_ids[id] = (unsigned long)subbuf->page; >>>> + subbuf->id = id; >>>> + >>>> + rb_inc_page(&subbuf); >>>> + id++; >>>> + } while (subbuf != first_subbuf); >>>> + >>>> + /* install subbuf ID to kern VA translation */ >>>> + cpu_buffer->subbuf_ids = subbuf_ids; >>>> + >>>> + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ >>>> + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; >>>> + meta->meta_struct_len = sizeof(*meta); >>>> + meta->nr_subbufs = nr_subbufs; >>>> + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; >>>> + >>>> + rb_update_meta_page(cpu_buffer); >>>> +} >>>> + >>>> +static struct ring_buffer_per_cpu * >>>> +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) >>>> +{ >>>> + struct ring_buffer_per_cpu *cpu_buffer; >>>> + >>>> + if (!cpumask_test_cpu(cpu, buffer->cpumask)) >>>> + return ERR_PTR(-EINVAL); >>>> + >>>> + cpu_buffer = buffer->buffers[cpu]; >>>> + >>>> + mutex_lock(&cpu_buffer->mapping_lock); >>>> + >>>> + if (!cpu_buffer->mapped) { >>>> + mutex_unlock(&cpu_buffer->mapping_lock); >>>> + return ERR_PTR(-ENODEV); >>>> + } >>>> + >>>> + return cpu_buffer; >>>> +} >>>> + >>>> +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) >>>> +{ >>>> + mutex_unlock(&cpu_buffer->mapping_lock); >>>> +} >>>> + >>>> +/* >>>> + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need >>>> + * to be set-up or torn-down. >>>> + */ >>>> +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, >>>> + bool inc) >>>> +{ >>>> + unsigned long flags; >>>> + >>>> + lockdep_assert_held(&cpu_buffer->mapping_lock); >>>> + >>>> + if (inc && cpu_buffer->mapped == UINT_MAX) >>>> + return -EBUSY; >>>> + >>>> + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) >>>> + return -EINVAL; >>>> + >>>> + mutex_lock(&cpu_buffer->buffer->mutex); >>>> + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); >>>> + >>>> + if (inc) >>>> + cpu_buffer->mapped++; >>>> + else >>>> + cpu_buffer->mapped--; >>>> + >>>> + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); >>>> + mutex_unlock(&cpu_buffer->buffer->mutex); >>>> + >>>> + return 0; >>>> +} >>>> + >>>> +#define subbuf_page(off, start) \ >>>> + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) >>>> + >>>> +#define foreach_subbuf_page(sub_order, start, page) \ >>>> + page = subbuf_page(0, (start)); \ >>>> + for (int __off = 0; __off < (1 << (sub_order)); \ >>>> + __off++, page = subbuf_page(__off, (start))) >>>> + >>>> +/* >>>> + * +--------------+ pgoff == 0 >>>> + * | meta page | >>>> + * +--------------+ pgoff == 1 >>>> + * | 000000000 | >>>> + * +--------------+ pgoff == (1 << subbuf_order) >>>> + * | subbuffer 0 | >>>> + * | | >>>> + * +--------------+ pgoff == (2 * (1 << subbuf_order)) >>>> + * | subbuffer 1 | >>>> + * | | >>>> + * ... >>>> + */ >>>> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, >>>> + struct vm_area_struct *vma) >>>> +{ >>>> + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; >>>> + unsigned int subbuf_pages, subbuf_order; >>>> + struct page **pages; >>>> + int p = 0, s = 0; >>>> + int err; >>>> + >>>> + lockdep_assert_held(&cpu_buffer->mapping_lock); >>>> + >>>> + subbuf_order = cpu_buffer->buffer->subbuf_order; >>>> + subbuf_pages = 1 << subbuf_order; >>>> + >>>> + if (subbuf_order && pgoff % subbuf_pages) >>>> + return -EINVAL; >>>> + >>>> + nr_subbufs = cpu_buffer->nr_pages + 1; >>>> + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; >>>> + >>>> + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; >>>> + if (!vma_pages || vma_pages > nr_pages) >>>> + return -EINVAL; >>>> + >>>> + nr_pages = vma_pages; >>>> + >>>> + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); >>>> + if (!pages) >>>> + return -ENOMEM; >>>> + >>>> + if (!pgoff) { >>>> + unsigned long meta_page_padding; >>>> + >>>> + pages[p++] = virt_to_page(cpu_buffer->meta_page); >>>> + >>>> + /* >>>> + * Pad with the zero-page to align the meta-page with the >>>> + * sub-buffers. >>>> + */ >>>> + meta_page_padding = subbuf_pages - 1; >>>> + while (meta_page_padding-- && p < nr_pages) >>>> + pages[p++] = ZERO_PAGE(0); >>> >>> Using the shared zeropage in a MAP_SHARED mapping that is neither VM_IO >>> nor VM_PFNMAP can be problematic. So we really need patch #3 logic to >>> use VM_PFNMAP. >>> >>> It would be cleaner/more obvious if these VMA flag setting would reside > > tracing_buffers_mmap() sets both VM_IO and VM_PFNMAP. But, yeah as vma is > already passed to ring_buffer_map(). The flags could be set here as this > function is what sets the requirements really. > >>> here, if possible. >>> >>>> + } else { >>>> + /* Skip the meta-page */ >>>> + pgoff -= subbuf_pages; >>>> + >>>> + s += pgoff / subbuf_pages; >>>> + } >>>> + >>>> + while (s < nr_subbufs && p < nr_pages) { >>>> + struct page *page; >>>> + >>>> + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { >>>> + if (p >= nr_pages) >>>> + break; >>>> + >>>> + pages[p++] = page; >>>> + } >>>> + s++; >>>> + } >>>> + >>>> + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); >>> >>> I think Linus suggested it ("avoid all the sub-page ref-counts entirely >>> by using VM_PFNMAP, and use vm_insert_pages()"), but ... >>> vm_insert_pages() will: >>> * Mess with mapcounts >>> * Mess with refcounts >>> >>> See >>> insert_pages()->insert_page_in_batch_locked()->insert_page_into_pte_locked(). >>> >>> So we'll mess with the mapcount and refcount of the shared zeropage ... >>> hmmmm >>> >>> If I am not wrong, vm_normal_page() would not return the shared zeropage >>> in case we don't have CONFIG_ARCH_HAS_PTE_SPECIAL ... so >>> unmap()->...->zap_present_ptes() would not decrement the refcount and we >>> could overflow it. ... we also shouldn't ever mess with the mapcount of >>> the shared zeropage in the first place. >>> >>> vm_insert_page() is clearer on that: "This allows drivers to insert >>> individual pages they've allocated into a user vma". You didn't allocate >>> the shared zeropage. >>> >>> I'm wondering if we even expect VM_MIXEDMAP and VM_PFNMAP to be set at >>> the same time? vm_insert_pages() would BUG_ON in case VM_PFNMAP is >>> already set and it would set VM_MIXEDMAP ... similarly >>> vmf_insert_pfn_prot() would not be happy about that at all ... >>> >>> BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == >>> (VM_PFNMAP|VM_MIXEDMAP)); >>> >>> ... remap_pfn_range() is used by io_uring_mmap for a similar purpose. >>> But it only supports a single PFN range at a time and requires the >>> caller to handle refcounting of pages. >>> >>> It's getting late in Germany, so I might be missing something; but using >>> the shared zeropage here might be a problem. >>> >> >> I was just about to write code to cleanly support vm_insert_pages() to >> consume zeropages ... when I started to wonder about something else: > > Alternatively, we could at the moment allocate a meta-page of the same size than > the subbufs? (of course the downside is to waste a bit of memory) Supporting the shared zeropage should be possible. We just have to be careful that no other code can accidentially set it writable. I'll have to further think about that (not just your user, but making it sane to use by new code as well). So far, we primarily only have shared zeropages in the MAP_PRIVATE mappings. Dax is a corner case where we have them in MAP_SHARED mappings. PFNMAP would not be problematic, but MIXEDMAP is. (again, I am not 100% sure about combining both ...) I'll further think about that one, and try crafting a reasonable patch. Could we start with no shared zeropage and implement that as an optimization on top? > >> >> >> + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || >> + !(vma->vm_flags & VM_MAYSHARE)) >> + return -EPERM; >> + >> >> You disallow writable mappings. But what about using mprotect() afterwards >> to allow for write permissions? >> >> Likely you'd want to remove VM_MAYWRITE from the flags, otherwise mprotect() >> might alter succeed. > > The vm_flags_mod below is clearing VM_MAYWRITE already. Isn't it enough? Oh, maybe I missed that!
On Mon, Apr 22, 2024 at 08:27:17PM +0200, David Hildenbrand wrote: > On 22.04.24 20:20, Vincent Donnefort wrote: > > Hi David, > > > > Thanks for having a look, very much appreciated! > > > > On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: > > > On 19.04.24 20:25, David Hildenbrand wrote: > > > > On 06.04.24 19:36, Vincent Donnefort wrote: > > > > > In preparation for allowing the user-space to map a ring-buffer, add > > > > > a set of mapping functions: > > > > > > > > > > ring_buffer_{map,unmap}() > > > > > > > > > > And controls on the ring-buffer: > > > > > > > > > > ring_buffer_map_get_reader() /* swap reader and head */ > > > > > > > > > > Mapping the ring-buffer also involves: > > > > > > > > > > A unique ID for each subbuf of the ring-buffer, currently they are > > > > > only identified through their in-kernel VA. > > > > > > > > > > A meta-page, where are stored ring-buffer statistics and a > > > > > description for the current reader > > > > > > > > > > The linear mapping exposes the meta-page, and each subbuf of the > > > > > ring-buffer, ordered following their unique ID, assigned during the > > > > > first mapping. > > > > > > > > > > Once mapped, no subbuf can get in or out of the ring-buffer: the buffer > > > > > size will remain unmodified and the splice enabling functions will in > > > > > reality simply memcpy the data instead of swapping subbufs. > > > > > > > > > > CC: <linux-mm@kvack.org> > > > > > Signed-off-by: Vincent Donnefort <vdonnefort@google.com> > > > > > > > > > > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h > > > > > index dc5ae4e96aee..96d2140b471e 100644 > > > > > --- a/include/linux/ring_buffer.h > > > > > +++ b/include/linux/ring_buffer.h > > > > > @@ -6,6 +6,8 @@ > > > > > #include <linux/seq_file.h> > > > > > #include <linux/poll.h> > > > > > +#include <uapi/linux/trace_mmap.h> > > > > > + > > > > > struct trace_buffer; > > > > > struct ring_buffer_iter; > > > > > @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); > > > > > #define trace_rb_cpu_prepare NULL > > > > > #endif > > > > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > > > > > + struct vm_area_struct *vma); > > > > > +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); > > > > > +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); > > > > > #endif /* _LINUX_RING_BUFFER_H */ > > > > > diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h > > > > > new file mode 100644 > > > > > index 000000000000..ffcd8dfcaa4f > > > > > --- /dev/null > > > > > +++ b/include/uapi/linux/trace_mmap.h > > > > > @@ -0,0 +1,46 @@ > > > > > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ > > > > > +#ifndef _TRACE_MMAP_H_ > > > > > +#define _TRACE_MMAP_H_ > > > > > + > > > > > +#include <linux/types.h> > > > > > + > > > > > +/** > > > > > + * struct trace_buffer_meta - Ring-buffer Meta-page description > > > > > + * @meta_page_size: Size of this meta-page. > > > > > + * @meta_struct_len: Size of this structure. > > > > > + * @subbuf_size: Size of each sub-buffer. > > > > > + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. > > > > > + * @reader.lost_events: Number of events lost at the time of the reader swap. > > > > > + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] > > > > > + * @reader.read: Number of bytes read on the reader subbuf. > > > > > + * @flags: Placeholder for now, 0 until new features are supported. > > > > > + * @entries: Number of entries in the ring-buffer. > > > > > + * @overrun: Number of entries lost in the ring-buffer. > > > > > + * @read: Number of entries that have been read. > > > > > + * @Reserved1: Reserved for future use. > > > > > + * @Reserved2: Reserved for future use. > > > > > + */ > > > > > +struct trace_buffer_meta { > > > > > + __u32 meta_page_size; > > > > > + __u32 meta_struct_len; > > > > > + > > > > > + __u32 subbuf_size; > > > > > + __u32 nr_subbufs; > > > > > + > > > > > + struct { > > > > > + __u64 lost_events; > > > > > + __u32 id; > > > > > + __u32 read; > > > > > + } reader; > > > > > + > > > > > + __u64 flags; > > > > > + > > > > > + __u64 entries; > > > > > + __u64 overrun; > > > > > + __u64 read; > > > > > + > > > > > + __u64 Reserved1; > > > > > + __u64 Reserved2; > > > > > +}; > > > > > + > > > > > +#endif /* _TRACE_MMAP_H_ */ > > > > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > > > > > index cc9ebe593571..793ecc454039 100644 > > > > > --- a/kernel/trace/ring_buffer.c > > > > > +++ b/kernel/trace/ring_buffer.c > > > > > @@ -9,6 +9,7 @@ > > > > > #include <linux/ring_buffer.h> > > > > > #include <linux/trace_clock.h> > > > > > #include <linux/sched/clock.h> > > > > > +#include <linux/cacheflush.h> > > > > > #include <linux/trace_seq.h> > > > > > #include <linux/spinlock.h> > > > > > #include <linux/irq_work.h> > > > > > @@ -26,6 +27,7 @@ > > > > > #include <linux/list.h> > > > > > #include <linux/cpu.h> > > > > > #include <linux/oom.h> > > > > > +#include <linux/mm.h> > > > > > #include <asm/local64.h> > > > > > #include <asm/local.h> > > > > > @@ -338,6 +340,7 @@ struct buffer_page { > > > > > local_t entries; /* entries on this page */ > > > > > unsigned long real_end; /* real end of data */ > > > > > unsigned order; /* order of the page */ > > > > > + u32 id; /* ID for external mapping */ > > > > > struct buffer_data_page *page; /* Actual data page */ > > > > > }; > > > > > @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { > > > > > u64 read_stamp; > > > > > /* pages removed since last reset */ > > > > > unsigned long pages_removed; > > > > > + > > > > > + unsigned int mapped; > > > > > + struct mutex mapping_lock; > > > > > + unsigned long *subbuf_ids; /* ID to subbuf VA */ > > > > > + struct trace_buffer_meta *meta_page; > > > > > + > > > > > /* ring buffer pages to update, > 0 to add, < 0 to remove */ > > > > > long nr_pages_to_update; > > > > > struct list_head new_pages; /* new pages to add */ > > > > > @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) > > > > > init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); > > > > > init_waitqueue_head(&cpu_buffer->irq_work.waiters); > > > > > init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); > > > > > + mutex_init(&cpu_buffer->mapping_lock); > > > > > bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), > > > > > GFP_KERNEL, cpu_to_node(cpu)); > > > > > @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) > > > > > return buffer->time_stamp_abs; > > > > > } > > > > > -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); > > > > > - > > > > > static inline unsigned long rb_page_entries(struct buffer_page *bpage) > > > > > { > > > > > return local_read(&bpage->entries) & RB_WRITE_MASK; > > > > > @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) > > > > > page->read = 0; > > > > > } > > > > > +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > > > > > +{ > > > > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > > > > + > > > > > + meta->reader.read = cpu_buffer->reader_page->read; > > > > > + meta->reader.id = cpu_buffer->reader_page->id; > > > > > + meta->reader.lost_events = cpu_buffer->lost_events; > > > > > + > > > > > + meta->entries = local_read(&cpu_buffer->entries); > > > > > + meta->overrun = local_read(&cpu_buffer->overrun); > > > > > + meta->read = cpu_buffer->read; > > > > > + > > > > > + /* Some archs do not have data cache coherency between kernel and user-space */ > > > > > + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); > > > > > +} > > > > > + > > > > > static void > > > > > rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) > > > > > { > > > > > @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) > > > > > cpu_buffer->lost_events = 0; > > > > > cpu_buffer->last_overrun = 0; > > > > > + if (cpu_buffer->mapped) > > > > > + rb_update_meta_page(cpu_buffer); > > > > > + > > > > > rb_head_page_activate(cpu_buffer); > > > > > cpu_buffer->pages_removed = 0; > > > > > } > > > > > @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, > > > > > cpu_buffer_a = buffer_a->buffers[cpu]; > > > > > cpu_buffer_b = buffer_b->buffers[cpu]; > > > > > + /* It's up to the callers to not try to swap mapped buffers */ > > > > > + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { > > > > > + ret = -EBUSY; > > > > > + goto out; > > > > > + } > > > > > + > > > > > /* At least make sure the two buffers are somewhat the same */ > > > > > if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) > > > > > goto out; > > > > > @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, > > > > > * Otherwise, we can simply swap the page with the one passed in. > > > > > */ > > > > > if (read || (len < (commit - read)) || > > > > > - cpu_buffer->reader_page == cpu_buffer->commit_page) { > > > > > + cpu_buffer->reader_page == cpu_buffer->commit_page || > > > > > + cpu_buffer->mapped) { > > > > > struct buffer_data_page *rpage = cpu_buffer->reader_page->page; > > > > > unsigned int rpos = read; > > > > > unsigned int pos = 0; > > > > > @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) > > > > > cpu_buffer = buffer->buffers[cpu]; > > > > > + if (cpu_buffer->mapped) { > > > > > + err = -EBUSY; > > > > > + goto error; > > > > > + } > > > > > + > > > > > /* Update the number of pages to match the new size */ > > > > > nr_pages = old_size * buffer->buffers[cpu]->nr_pages; > > > > > nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); > > > > > @@ -6057,6 +6096,358 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) > > > > > } > > > > > EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); > > > > > +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > > > > > +{ > > > > > + struct page *page; > > > > > + > > > > > + if (cpu_buffer->meta_page) > > > > > + return 0; > > > > > + > > > > > + page = alloc_page(GFP_USER | __GFP_ZERO); > > > > > + if (!page) > > > > > + return -ENOMEM; > > > > > + > > > > > + cpu_buffer->meta_page = page_to_virt(page); > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) > > > > > +{ > > > > > + unsigned long addr = (unsigned long)cpu_buffer->meta_page; > > > > > + > > > > > + free_page(addr); > > > > > + cpu_buffer->meta_page = NULL; > > > > > +} > > > > > + > > > > > +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, > > > > > + unsigned long *subbuf_ids) > > > > > +{ > > > > > + struct trace_buffer_meta *meta = cpu_buffer->meta_page; > > > > > + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; > > > > > + struct buffer_page *first_subbuf, *subbuf; > > > > > + int id = 0; > > > > > + > > > > > + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; > > > > > + cpu_buffer->reader_page->id = id++; > > > > > + > > > > > + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); > > > > > + do { > > > > > + if (WARN_ON(id >= nr_subbufs)) > > > > > + break; > > > > > + > > > > > + subbuf_ids[id] = (unsigned long)subbuf->page; > > > > > + subbuf->id = id; > > > > > + > > > > > + rb_inc_page(&subbuf); > > > > > + id++; > > > > > + } while (subbuf != first_subbuf); > > > > > + > > > > > + /* install subbuf ID to kern VA translation */ > > > > > + cpu_buffer->subbuf_ids = subbuf_ids; > > > > > + > > > > > + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ > > > > > + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; > > > > > + meta->meta_struct_len = sizeof(*meta); > > > > > + meta->nr_subbufs = nr_subbufs; > > > > > + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; > > > > > + > > > > > + rb_update_meta_page(cpu_buffer); > > > > > +} > > > > > + > > > > > +static struct ring_buffer_per_cpu * > > > > > +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) > > > > > +{ > > > > > + struct ring_buffer_per_cpu *cpu_buffer; > > > > > + > > > > > + if (!cpumask_test_cpu(cpu, buffer->cpumask)) > > > > > + return ERR_PTR(-EINVAL); > > > > > + > > > > > + cpu_buffer = buffer->buffers[cpu]; > > > > > + > > > > > + mutex_lock(&cpu_buffer->mapping_lock); > > > > > + > > > > > + if (!cpu_buffer->mapped) { > > > > > + mutex_unlock(&cpu_buffer->mapping_lock); > > > > > + return ERR_PTR(-ENODEV); > > > > > + } > > > > > + > > > > > + return cpu_buffer; > > > > > +} > > > > > + > > > > > +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) > > > > > +{ > > > > > + mutex_unlock(&cpu_buffer->mapping_lock); > > > > > +} > > > > > + > > > > > +/* > > > > > + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need > > > > > + * to be set-up or torn-down. > > > > > + */ > > > > > +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, > > > > > + bool inc) > > > > > +{ > > > > > + unsigned long flags; > > > > > + > > > > > + lockdep_assert_held(&cpu_buffer->mapping_lock); > > > > > + > > > > > + if (inc && cpu_buffer->mapped == UINT_MAX) > > > > > + return -EBUSY; > > > > > + > > > > > + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) > > > > > + return -EINVAL; > > > > > + > > > > > + mutex_lock(&cpu_buffer->buffer->mutex); > > > > > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > > > > > + > > > > > + if (inc) > > > > > + cpu_buffer->mapped++; > > > > > + else > > > > > + cpu_buffer->mapped--; > > > > > + > > > > > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > > > > > + mutex_unlock(&cpu_buffer->buffer->mutex); > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > +#define subbuf_page(off, start) \ > > > > > + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) > > > > > + > > > > > +#define foreach_subbuf_page(sub_order, start, page) \ > > > > > + page = subbuf_page(0, (start)); \ > > > > > + for (int __off = 0; __off < (1 << (sub_order)); \ > > > > > + __off++, page = subbuf_page(__off, (start))) > > > > > + > > > > > +/* > > > > > + * +--------------+ pgoff == 0 > > > > > + * | meta page | > > > > > + * +--------------+ pgoff == 1 > > > > > + * | 000000000 | > > > > > + * +--------------+ pgoff == (1 << subbuf_order) > > > > > + * | subbuffer 0 | > > > > > + * | | > > > > > + * +--------------+ pgoff == (2 * (1 << subbuf_order)) > > > > > + * | subbuffer 1 | > > > > > + * | | > > > > > + * ... > > > > > + */ > > > > > +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, > > > > > + struct vm_area_struct *vma) > > > > > +{ > > > > > + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; > > > > > + unsigned int subbuf_pages, subbuf_order; > > > > > + struct page **pages; > > > > > + int p = 0, s = 0; > > > > > + int err; > > > > > + > > > > > + lockdep_assert_held(&cpu_buffer->mapping_lock); > > > > > + > > > > > + subbuf_order = cpu_buffer->buffer->subbuf_order; > > > > > + subbuf_pages = 1 << subbuf_order; > > > > > + > > > > > + if (subbuf_order && pgoff % subbuf_pages) > > > > > + return -EINVAL; > > > > > + > > > > > + nr_subbufs = cpu_buffer->nr_pages + 1; > > > > > + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; > > > > > + > > > > > + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; > > > > > + if (!vma_pages || vma_pages > nr_pages) > > > > > + return -EINVAL; > > > > > + > > > > > + nr_pages = vma_pages; > > > > > + > > > > > + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); > > > > > + if (!pages) > > > > > + return -ENOMEM; > > > > > + > > > > > + if (!pgoff) { > > > > > + unsigned long meta_page_padding; > > > > > + > > > > > + pages[p++] = virt_to_page(cpu_buffer->meta_page); > > > > > + > > > > > + /* > > > > > + * Pad with the zero-page to align the meta-page with the > > > > > + * sub-buffers. > > > > > + */ > > > > > + meta_page_padding = subbuf_pages - 1; > > > > > + while (meta_page_padding-- && p < nr_pages) > > > > > + pages[p++] = ZERO_PAGE(0); > > > > > > > > Using the shared zeropage in a MAP_SHARED mapping that is neither VM_IO > > > > nor VM_PFNMAP can be problematic. So we really need patch #3 logic to > > > > use VM_PFNMAP. > > > > > > > > It would be cleaner/more obvious if these VMA flag setting would reside > > > > tracing_buffers_mmap() sets both VM_IO and VM_PFNMAP. But, yeah as vma is > > already passed to ring_buffer_map(). The flags could be set here as this > > function is what sets the requirements really. > > > > > > here, if possible. > > > > > > > > > + } else { > > > > > + /* Skip the meta-page */ > > > > > + pgoff -= subbuf_pages; > > > > > + > > > > > + s += pgoff / subbuf_pages; > > > > > + } > > > > > + > > > > > + while (s < nr_subbufs && p < nr_pages) { > > > > > + struct page *page; > > > > > + > > > > > + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { > > > > > + if (p >= nr_pages) > > > > > + break; > > > > > + > > > > > + pages[p++] = page; > > > > > + } > > > > > + s++; > > > > > + } > > > > > + > > > > > + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); > > > > > > > > I think Linus suggested it ("avoid all the sub-page ref-counts entirely > > > > by using VM_PFNMAP, and use vm_insert_pages()"), but ... > > > > vm_insert_pages() will: > > > > * Mess with mapcounts > > > > * Mess with refcounts > > > > > > > > See > > > > insert_pages()->insert_page_in_batch_locked()->insert_page_into_pte_locked(). > > > > > > > > So we'll mess with the mapcount and refcount of the shared zeropage ... > > > > hmmmm > > > > > > > > If I am not wrong, vm_normal_page() would not return the shared zeropage > > > > in case we don't have CONFIG_ARCH_HAS_PTE_SPECIAL ... so > > > > unmap()->...->zap_present_ptes() would not decrement the refcount and we > > > > could overflow it. ... we also shouldn't ever mess with the mapcount of > > > > the shared zeropage in the first place. > > > > > > > > vm_insert_page() is clearer on that: "This allows drivers to insert > > > > individual pages they've allocated into a user vma". You didn't allocate > > > > the shared zeropage. > > > > > > > > I'm wondering if we even expect VM_MIXEDMAP and VM_PFNMAP to be set at > > > > the same time? vm_insert_pages() would BUG_ON in case VM_PFNMAP is > > > > already set and it would set VM_MIXEDMAP ... similarly > > > > vmf_insert_pfn_prot() would not be happy about that at all ... > > > > > > > > BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == > > > > (VM_PFNMAP|VM_MIXEDMAP)); > > > > > > > > ... remap_pfn_range() is used by io_uring_mmap for a similar purpose. > > > > But it only supports a single PFN range at a time and requires the > > > > caller to handle refcounting of pages. > > > > > > > > It's getting late in Germany, so I might be missing something; but using > > > > the shared zeropage here might be a problem. > > > > > > > > > > I was just about to write code to cleanly support vm_insert_pages() to > > > consume zeropages ... when I started to wonder about something else: > > > > Alternatively, we could at the moment allocate a meta-page of the same size than > > the subbufs? (of course the downside is to waste a bit of memory) > > Supporting the shared zeropage should be possible. We just have to be > careful that no other code can accidentially set it writable. I'll have to > further think about that (not just your user, but making it sane to use by > new code as well). > > So far, we primarily only have shared zeropages in the MAP_PRIVATE mappings. > Dax is a corner case where we have them in MAP_SHARED mappings. PFNMAP would > not be problematic, but MIXEDMAP is. (again, I am not 100% sure about > combining both ...) > > I'll further think about that one, and try crafting a reasonable patch. > > Could we start with no shared zeropage and implement that as an optimization > on top? Of course, I'm happy to update this series without the zero-page and to bring it back later, in a separate patch, as soon as vm_insert_pages() is compatible. Thanks! > > > > > > > > > > > > + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC || > > > + !(vma->vm_flags & VM_MAYSHARE)) > > > + return -EPERM; > > > + > > > > > > You disallow writable mappings. But what about using mprotect() afterwards > > > to allow for write permissions? > > > > > > Likely you'd want to remove VM_MAYWRITE from the flags, otherwise mprotect() > > > might alter succeed. > > > > The vm_flags_mod below is clearing VM_MAYWRITE already. Isn't it enough? > > Oh, maybe I missed that! > > -- > Cheers, > > David / dhildenb >
On 22.04.24 22:31, Vincent Donnefort wrote: > On Mon, Apr 22, 2024 at 08:27:17PM +0200, David Hildenbrand wrote: >> On 22.04.24 20:20, Vincent Donnefort wrote: >>> Hi David, >>> >>> Thanks for having a look, very much appreciated! >>> >>> On Mon, Apr 22, 2024 at 11:27:11AM +0200, David Hildenbrand wrote: >>>> On 19.04.24 20:25, David Hildenbrand wrote: >>>>> On 06.04.24 19:36, Vincent Donnefort wrote: >>>>>> In preparation for allowing the user-space to map a ring-buffer, add >>>>>> a set of mapping functions: >>>>>> >>>>>> ring_buffer_{map,unmap}() >>>>>> >>>>>> And controls on the ring-buffer: >>>>>> >>>>>> ring_buffer_map_get_reader() /* swap reader and head */ >>>>>> >>>>>> Mapping the ring-buffer also involves: >>>>>> >>>>>> A unique ID for each subbuf of the ring-buffer, currently they are >>>>>> only identified through their in-kernel VA. >>>>>> >>>>>> A meta-page, where are stored ring-buffer statistics and a >>>>>> description for the current reader >>>>>> >>>>>> The linear mapping exposes the meta-page, and each subbuf of the >>>>>> ring-buffer, ordered following their unique ID, assigned during the >>>>>> first mapping. >>>>>> >>>>>> Once mapped, no subbuf can get in or out of the ring-buffer: the buffer >>>>>> size will remain unmodified and the splice enabling functions will in >>>>>> reality simply memcpy the data instead of swapping subbufs. >>>>>> >>>>>> CC: <linux-mm@kvack.org> >>>>>> Signed-off-by: Vincent Donnefort <vdonnefort@google.com> >>>>>> >>>>>> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h >>>>>> index dc5ae4e96aee..96d2140b471e 100644 >>>>>> --- a/include/linux/ring_buffer.h >>>>>> +++ b/include/linux/ring_buffer.h >>>>>> @@ -6,6 +6,8 @@ >>>>>> #include <linux/seq_file.h> >>>>>> #include <linux/poll.h> >>>>>> +#include <uapi/linux/trace_mmap.h> >>>>>> + >>>>>> struct trace_buffer; >>>>>> struct ring_buffer_iter; >>>>>> @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); >>>>>> #define trace_rb_cpu_prepare NULL >>>>>> #endif >>>>>> +int ring_buffer_map(struct trace_buffer *buffer, int cpu, >>>>>> + struct vm_area_struct *vma); >>>>>> +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); >>>>>> +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); >>>>>> #endif /* _LINUX_RING_BUFFER_H */ >>>>>> diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h >>>>>> new file mode 100644 >>>>>> index 000000000000..ffcd8dfcaa4f >>>>>> --- /dev/null >>>>>> +++ b/include/uapi/linux/trace_mmap.h >>>>>> @@ -0,0 +1,46 @@ >>>>>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ >>>>>> +#ifndef _TRACE_MMAP_H_ >>>>>> +#define _TRACE_MMAP_H_ >>>>>> + >>>>>> +#include <linux/types.h> >>>>>> + >>>>>> +/** >>>>>> + * struct trace_buffer_meta - Ring-buffer Meta-page description >>>>>> + * @meta_page_size: Size of this meta-page. >>>>>> + * @meta_struct_len: Size of this structure. >>>>>> + * @subbuf_size: Size of each sub-buffer. >>>>>> + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. >>>>>> + * @reader.lost_events: Number of events lost at the time of the reader swap. >>>>>> + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] >>>>>> + * @reader.read: Number of bytes read on the reader subbuf. >>>>>> + * @flags: Placeholder for now, 0 until new features are supported. >>>>>> + * @entries: Number of entries in the ring-buffer. >>>>>> + * @overrun: Number of entries lost in the ring-buffer. >>>>>> + * @read: Number of entries that have been read. >>>>>> + * @Reserved1: Reserved for future use. >>>>>> + * @Reserved2: Reserved for future use. >>>>>> + */ >>>>>> +struct trace_buffer_meta { >>>>>> + __u32 meta_page_size; >>>>>> + __u32 meta_struct_len; >>>>>> + >>>>>> + __u32 subbuf_size; >>>>>> + __u32 nr_subbufs; >>>>>> + >>>>>> + struct { >>>>>> + __u64 lost_events; >>>>>> + __u32 id; >>>>>> + __u32 read; >>>>>> + } reader; >>>>>> + >>>>>> + __u64 flags; >>>>>> + >>>>>> + __u64 entries; >>>>>> + __u64 overrun; >>>>>> + __u64 read; >>>>>> + >>>>>> + __u64 Reserved1; >>>>>> + __u64 Reserved2; >>>>>> +}; >>>>>> + >>>>>> +#endif /* _TRACE_MMAP_H_ */ >>>>>> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c >>>>>> index cc9ebe593571..793ecc454039 100644 >>>>>> --- a/kernel/trace/ring_buffer.c >>>>>> +++ b/kernel/trace/ring_buffer.c >>>>>> @@ -9,6 +9,7 @@ >>>>>> #include <linux/ring_buffer.h> >>>>>> #include <linux/trace_clock.h> >>>>>> #include <linux/sched/clock.h> >>>>>> +#include <linux/cacheflush.h> >>>>>> #include <linux/trace_seq.h> >>>>>> #include <linux/spinlock.h> >>>>>> #include <linux/irq_work.h> >>>>>> @@ -26,6 +27,7 @@ >>>>>> #include <linux/list.h> >>>>>> #include <linux/cpu.h> >>>>>> #include <linux/oom.h> >>>>>> +#include <linux/mm.h> >>>>>> #include <asm/local64.h> >>>>>> #include <asm/local.h> >>>>>> @@ -338,6 +340,7 @@ struct buffer_page { >>>>>> local_t entries; /* entries on this page */ >>>>>> unsigned long real_end; /* real end of data */ >>>>>> unsigned order; /* order of the page */ >>>>>> + u32 id; /* ID for external mapping */ >>>>>> struct buffer_data_page *page; /* Actual data page */ >>>>>> }; >>>>>> @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { >>>>>> u64 read_stamp; >>>>>> /* pages removed since last reset */ >>>>>> unsigned long pages_removed; >>>>>> + >>>>>> + unsigned int mapped; >>>>>> + struct mutex mapping_lock; >>>>>> + unsigned long *subbuf_ids; /* ID to subbuf VA */ >>>>>> + struct trace_buffer_meta *meta_page; >>>>>> + >>>>>> /* ring buffer pages to update, > 0 to add, < 0 to remove */ >>>>>> long nr_pages_to_update; >>>>>> struct list_head new_pages; /* new pages to add */ >>>>>> @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) >>>>>> init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); >>>>>> init_waitqueue_head(&cpu_buffer->irq_work.waiters); >>>>>> init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); >>>>>> + mutex_init(&cpu_buffer->mapping_lock); >>>>>> bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), >>>>>> GFP_KERNEL, cpu_to_node(cpu)); >>>>>> @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) >>>>>> return buffer->time_stamp_abs; >>>>>> } >>>>>> -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); >>>>>> - >>>>>> static inline unsigned long rb_page_entries(struct buffer_page *bpage) >>>>>> { >>>>>> return local_read(&bpage->entries) & RB_WRITE_MASK; >>>>>> @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) >>>>>> page->read = 0; >>>>>> } >>>>>> +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >>>>>> +{ >>>>>> + struct trace_buffer_meta *meta = cpu_buffer->meta_page; >>>>>> + >>>>>> + meta->reader.read = cpu_buffer->reader_page->read; >>>>>> + meta->reader.id = cpu_buffer->reader_page->id; >>>>>> + meta->reader.lost_events = cpu_buffer->lost_events; >>>>>> + >>>>>> + meta->entries = local_read(&cpu_buffer->entries); >>>>>> + meta->overrun = local_read(&cpu_buffer->overrun); >>>>>> + meta->read = cpu_buffer->read; >>>>>> + >>>>>> + /* Some archs do not have data cache coherency between kernel and user-space */ >>>>>> + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); >>>>>> +} >>>>>> + >>>>>> static void >>>>>> rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) >>>>>> { >>>>>> @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) >>>>>> cpu_buffer->lost_events = 0; >>>>>> cpu_buffer->last_overrun = 0; >>>>>> + if (cpu_buffer->mapped) >>>>>> + rb_update_meta_page(cpu_buffer); >>>>>> + >>>>>> rb_head_page_activate(cpu_buffer); >>>>>> cpu_buffer->pages_removed = 0; >>>>>> } >>>>>> @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, >>>>>> cpu_buffer_a = buffer_a->buffers[cpu]; >>>>>> cpu_buffer_b = buffer_b->buffers[cpu]; >>>>>> + /* It's up to the callers to not try to swap mapped buffers */ >>>>>> + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { >>>>>> + ret = -EBUSY; >>>>>> + goto out; >>>>>> + } >>>>>> + >>>>>> /* At least make sure the two buffers are somewhat the same */ >>>>>> if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) >>>>>> goto out; >>>>>> @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, >>>>>> * Otherwise, we can simply swap the page with the one passed in. >>>>>> */ >>>>>> if (read || (len < (commit - read)) || >>>>>> - cpu_buffer->reader_page == cpu_buffer->commit_page) { >>>>>> + cpu_buffer->reader_page == cpu_buffer->commit_page || >>>>>> + cpu_buffer->mapped) { >>>>>> struct buffer_data_page *rpage = cpu_buffer->reader_page->page; >>>>>> unsigned int rpos = read; >>>>>> unsigned int pos = 0; >>>>>> @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) >>>>>> cpu_buffer = buffer->buffers[cpu]; >>>>>> + if (cpu_buffer->mapped) { >>>>>> + err = -EBUSY; >>>>>> + goto error; >>>>>> + } >>>>>> + >>>>>> /* Update the number of pages to match the new size */ >>>>>> nr_pages = old_size * buffer->buffers[cpu]->nr_pages; >>>>>> nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); >>>>>> @@ -6057,6 +6096,358 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) >>>>>> } >>>>>> EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); >>>>>> +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >>>>>> +{ >>>>>> + struct page *page; >>>>>> + >>>>>> + if (cpu_buffer->meta_page) >>>>>> + return 0; >>>>>> + >>>>>> + page = alloc_page(GFP_USER | __GFP_ZERO); >>>>>> + if (!page) >>>>>> + return -ENOMEM; >>>>>> + >>>>>> + cpu_buffer->meta_page = page_to_virt(page); >>>>>> + >>>>>> + return 0; >>>>>> +} >>>>>> + >>>>>> +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) >>>>>> +{ >>>>>> + unsigned long addr = (unsigned long)cpu_buffer->meta_page; >>>>>> + >>>>>> + free_page(addr); >>>>>> + cpu_buffer->meta_page = NULL; >>>>>> +} >>>>>> + >>>>>> +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, >>>>>> + unsigned long *subbuf_ids) >>>>>> +{ >>>>>> + struct trace_buffer_meta *meta = cpu_buffer->meta_page; >>>>>> + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; >>>>>> + struct buffer_page *first_subbuf, *subbuf; >>>>>> + int id = 0; >>>>>> + >>>>>> + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; >>>>>> + cpu_buffer->reader_page->id = id++; >>>>>> + >>>>>> + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); >>>>>> + do { >>>>>> + if (WARN_ON(id >= nr_subbufs)) >>>>>> + break; >>>>>> + >>>>>> + subbuf_ids[id] = (unsigned long)subbuf->page; >>>>>> + subbuf->id = id; >>>>>> + >>>>>> + rb_inc_page(&subbuf); >>>>>> + id++; >>>>>> + } while (subbuf != first_subbuf); >>>>>> + >>>>>> + /* install subbuf ID to kern VA translation */ >>>>>> + cpu_buffer->subbuf_ids = subbuf_ids; >>>>>> + >>>>>> + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ >>>>>> + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; >>>>>> + meta->meta_struct_len = sizeof(*meta); >>>>>> + meta->nr_subbufs = nr_subbufs; >>>>>> + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; >>>>>> + >>>>>> + rb_update_meta_page(cpu_buffer); >>>>>> +} >>>>>> + >>>>>> +static struct ring_buffer_per_cpu * >>>>>> +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) >>>>>> +{ >>>>>> + struct ring_buffer_per_cpu *cpu_buffer; >>>>>> + >>>>>> + if (!cpumask_test_cpu(cpu, buffer->cpumask)) >>>>>> + return ERR_PTR(-EINVAL); >>>>>> + >>>>>> + cpu_buffer = buffer->buffers[cpu]; >>>>>> + >>>>>> + mutex_lock(&cpu_buffer->mapping_lock); >>>>>> + >>>>>> + if (!cpu_buffer->mapped) { >>>>>> + mutex_unlock(&cpu_buffer->mapping_lock); >>>>>> + return ERR_PTR(-ENODEV); >>>>>> + } >>>>>> + >>>>>> + return cpu_buffer; >>>>>> +} >>>>>> + >>>>>> +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) >>>>>> +{ >>>>>> + mutex_unlock(&cpu_buffer->mapping_lock); >>>>>> +} >>>>>> + >>>>>> +/* >>>>>> + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need >>>>>> + * to be set-up or torn-down. >>>>>> + */ >>>>>> +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, >>>>>> + bool inc) >>>>>> +{ >>>>>> + unsigned long flags; >>>>>> + >>>>>> + lockdep_assert_held(&cpu_buffer->mapping_lock); >>>>>> + >>>>>> + if (inc && cpu_buffer->mapped == UINT_MAX) >>>>>> + return -EBUSY; >>>>>> + >>>>>> + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) >>>>>> + return -EINVAL; >>>>>> + >>>>>> + mutex_lock(&cpu_buffer->buffer->mutex); >>>>>> + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); >>>>>> + >>>>>> + if (inc) >>>>>> + cpu_buffer->mapped++; >>>>>> + else >>>>>> + cpu_buffer->mapped--; >>>>>> + >>>>>> + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); >>>>>> + mutex_unlock(&cpu_buffer->buffer->mutex); >>>>>> + >>>>>> + return 0; >>>>>> +} >>>>>> + >>>>>> +#define subbuf_page(off, start) \ >>>>>> + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) >>>>>> + >>>>>> +#define foreach_subbuf_page(sub_order, start, page) \ >>>>>> + page = subbuf_page(0, (start)); \ >>>>>> + for (int __off = 0; __off < (1 << (sub_order)); \ >>>>>> + __off++, page = subbuf_page(__off, (start))) >>>>>> + >>>>>> +/* >>>>>> + * +--------------+ pgoff == 0 >>>>>> + * | meta page | >>>>>> + * +--------------+ pgoff == 1 >>>>>> + * | 000000000 | >>>>>> + * +--------------+ pgoff == (1 << subbuf_order) >>>>>> + * | subbuffer 0 | >>>>>> + * | | >>>>>> + * +--------------+ pgoff == (2 * (1 << subbuf_order)) >>>>>> + * | subbuffer 1 | >>>>>> + * | | >>>>>> + * ... >>>>>> + */ >>>>>> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, >>>>>> + struct vm_area_struct *vma) >>>>>> +{ >>>>>> + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; >>>>>> + unsigned int subbuf_pages, subbuf_order; >>>>>> + struct page **pages; >>>>>> + int p = 0, s = 0; >>>>>> + int err; >>>>>> + >>>>>> + lockdep_assert_held(&cpu_buffer->mapping_lock); >>>>>> + >>>>>> + subbuf_order = cpu_buffer->buffer->subbuf_order; >>>>>> + subbuf_pages = 1 << subbuf_order; >>>>>> + >>>>>> + if (subbuf_order && pgoff % subbuf_pages) >>>>>> + return -EINVAL; >>>>>> + >>>>>> + nr_subbufs = cpu_buffer->nr_pages + 1; >>>>>> + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; >>>>>> + >>>>>> + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; >>>>>> + if (!vma_pages || vma_pages > nr_pages) >>>>>> + return -EINVAL; >>>>>> + >>>>>> + nr_pages = vma_pages; >>>>>> + >>>>>> + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); >>>>>> + if (!pages) >>>>>> + return -ENOMEM; >>>>>> + >>>>>> + if (!pgoff) { >>>>>> + unsigned long meta_page_padding; >>>>>> + >>>>>> + pages[p++] = virt_to_page(cpu_buffer->meta_page); >>>>>> + >>>>>> + /* >>>>>> + * Pad with the zero-page to align the meta-page with the >>>>>> + * sub-buffers. >>>>>> + */ >>>>>> + meta_page_padding = subbuf_pages - 1; >>>>>> + while (meta_page_padding-- && p < nr_pages) >>>>>> + pages[p++] = ZERO_PAGE(0); >>>>> >>>>> Using the shared zeropage in a MAP_SHARED mapping that is neither VM_IO >>>>> nor VM_PFNMAP can be problematic. So we really need patch #3 logic to >>>>> use VM_PFNMAP. >>>>> >>>>> It would be cleaner/more obvious if these VMA flag setting would reside >>> >>> tracing_buffers_mmap() sets both VM_IO and VM_PFNMAP. But, yeah as vma is >>> already passed to ring_buffer_map(). The flags could be set here as this >>> function is what sets the requirements really. >>> >>>>> here, if possible. >>>>> >>>>>> + } else { >>>>>> + /* Skip the meta-page */ >>>>>> + pgoff -= subbuf_pages; >>>>>> + >>>>>> + s += pgoff / subbuf_pages; >>>>>> + } >>>>>> + >>>>>> + while (s < nr_subbufs && p < nr_pages) { >>>>>> + struct page *page; >>>>>> + >>>>>> + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { >>>>>> + if (p >= nr_pages) >>>>>> + break; >>>>>> + >>>>>> + pages[p++] = page; >>>>>> + } >>>>>> + s++; >>>>>> + } >>>>>> + >>>>>> + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); >>>>> >>>>> I think Linus suggested it ("avoid all the sub-page ref-counts entirely >>>>> by using VM_PFNMAP, and use vm_insert_pages()"), but ... >>>>> vm_insert_pages() will: >>>>> * Mess with mapcounts >>>>> * Mess with refcounts >>>>> >>>>> See >>>>> insert_pages()->insert_page_in_batch_locked()->insert_page_into_pte_locked(). >>>>> >>>>> So we'll mess with the mapcount and refcount of the shared zeropage ... >>>>> hmmmm >>>>> >>>>> If I am not wrong, vm_normal_page() would not return the shared zeropage >>>>> in case we don't have CONFIG_ARCH_HAS_PTE_SPECIAL ... so >>>>> unmap()->...->zap_present_ptes() would not decrement the refcount and we >>>>> could overflow it. ... we also shouldn't ever mess with the mapcount of >>>>> the shared zeropage in the first place. >>>>> >>>>> vm_insert_page() is clearer on that: "This allows drivers to insert >>>>> individual pages they've allocated into a user vma". You didn't allocate >>>>> the shared zeropage. >>>>> >>>>> I'm wondering if we even expect VM_MIXEDMAP and VM_PFNMAP to be set at >>>>> the same time? vm_insert_pages() would BUG_ON in case VM_PFNMAP is >>>>> already set and it would set VM_MIXEDMAP ... similarly >>>>> vmf_insert_pfn_prot() would not be happy about that at all ... >>>>> >>>>> BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == >>>>> (VM_PFNMAP|VM_MIXEDMAP)); >>>>> >>>>> ... remap_pfn_range() is used by io_uring_mmap for a similar purpose. >>>>> But it only supports a single PFN range at a time and requires the >>>>> caller to handle refcounting of pages. >>>>> >>>>> It's getting late in Germany, so I might be missing something; but using >>>>> the shared zeropage here might be a problem. >>>>> >>>> >>>> I was just about to write code to cleanly support vm_insert_pages() to >>>> consume zeropages ... when I started to wonder about something else: >>> >>> Alternatively, we could at the moment allocate a meta-page of the same size than >>> the subbufs? (of course the downside is to waste a bit of memory) >> >> Supporting the shared zeropage should be possible. We just have to be >> careful that no other code can accidentially set it writable. I'll have to >> further think about that (not just your user, but making it sane to use by >> new code as well). >> >> So far, we primarily only have shared zeropages in the MAP_PRIVATE mappings. >> Dax is a corner case where we have them in MAP_SHARED mappings. PFNMAP would >> not be problematic, but MIXEDMAP is. (again, I am not 100% sure about >> combining both ...) >> >> I'll further think about that one, and try crafting a reasonable patch. >> >> Could we start with no shared zeropage and implement that as an optimization >> on top? > > Of course, I'm happy to update this series without the zero-page and to bring it > back later, in a separate patch, as soon as vm_insert_pages() is compatible. > Good, let's explore that path first!
* Steven Rostedt <rostedt@goodmis.org> [240410 13:41]: > On Sat, 6 Apr 2024 18:36:46 +0100 > Vincent Donnefort <vdonnefort@google.com> wrote: > > > +int ring_buffer_map(struct trace_buffer *buffer, int cpu, > > + struct vm_area_struct *vma) > > +{ > > + struct ring_buffer_per_cpu *cpu_buffer; > > + unsigned long flags, *subbuf_ids; > > + int err = 0; > > + > > + if (!cpumask_test_cpu(cpu, buffer->cpumask)) > > + return -EINVAL; > > + > > + cpu_buffer = buffer->buffers[cpu]; > > + > > + mutex_lock(&cpu_buffer->mapping_lock); > > + > > + if (cpu_buffer->mapped) { > > + err = __rb_map_vma(cpu_buffer, vma); > > + if (!err) > > + err = __rb_inc_dec_mapped(cpu_buffer, true); > > + mutex_unlock(&cpu_buffer->mapping_lock); > > + return err; > > + } > > + > > + /* prevent another thread from changing buffer/sub-buffer sizes */ > > + mutex_lock(&buffer->mutex); > > + > > + err = rb_alloc_meta_page(cpu_buffer); > > + if (err) > > + goto unlock; > > + > > + /* subbuf_ids include the reader while nr_pages does not */ > > + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); > > + if (!subbuf_ids) { > > + rb_free_meta_page(cpu_buffer); > > + err = -ENOMEM; > > + goto unlock; > > + } > > + > > + atomic_inc(&cpu_buffer->resize_disabled); > > + > > + /* > > + * Lock all readers to block any subbuf swap until the subbuf IDs are > > + * assigned. > > + */ > > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > > + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); > > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > > + > > + err = __rb_map_vma(cpu_buffer, vma); > > + if (!err) { > > + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); > > + cpu_buffer->mapped = 1; > > + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); > > + } else { > > + kfree(cpu_buffer->subbuf_ids); > > + cpu_buffer->subbuf_ids = NULL; > > + rb_free_meta_page(cpu_buffer); > > + } > > +unlock: > > Nit: For all labels, please add a space before them. Otherwise, diffs will > show "unlock" as the function and not "ring_buffer_map", making it harder > to find where the change is. > Isn't the inclusion of a space before labels outdate since 'git diff' superseds 'diff' and fixed this issue? Thanks, Liam
On Tue, 23 Apr 2024 12:04:15 -0400 "Liam R. Howlett" <Liam.Howlett@oracle.com> wrote: > > Nit: For all labels, please add a space before them. Otherwise, diffs will > > show "unlock" as the function and not "ring_buffer_map", making it harder > > to find where the change is. > > > > Isn't the inclusion of a space before labels outdate since 'git diff' > superseds 'diff' and fixed this issue? I still use patch and quilt ;-) and that still suffers from that on reject files (used for backporting and such). -- Steve
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index dc5ae4e96aee..96d2140b471e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -6,6 +6,8 @@ #include <linux/seq_file.h> #include <linux/poll.h> +#include <uapi/linux/trace_mmap.h> + struct trace_buffer; struct ring_buffer_iter; @@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); #define trace_rb_cpu_prepare NULL #endif +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma); +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); #endif /* _LINUX_RING_BUFFER_H */ diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h new file mode 100644 index 000000000000..ffcd8dfcaa4f --- /dev/null +++ b/include/uapi/linux/trace_mmap.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _TRACE_MMAP_H_ +#define _TRACE_MMAP_H_ + +#include <linux/types.h> + +/** + * struct trace_buffer_meta - Ring-buffer Meta-page description + * @meta_page_size: Size of this meta-page. + * @meta_struct_len: Size of this structure. + * @subbuf_size: Size of each sub-buffer. + * @nr_subbufs: Number of subbfs in the ring-buffer, including the reader. + * @reader.lost_events: Number of events lost at the time of the reader swap. + * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1] + * @reader.read: Number of bytes read on the reader subbuf. + * @flags: Placeholder for now, 0 until new features are supported. + * @entries: Number of entries in the ring-buffer. + * @overrun: Number of entries lost in the ring-buffer. + * @read: Number of entries that have been read. + * @Reserved1: Reserved for future use. + * @Reserved2: Reserved for future use. + */ +struct trace_buffer_meta { + __u32 meta_page_size; + __u32 meta_struct_len; + + __u32 subbuf_size; + __u32 nr_subbufs; + + struct { + __u64 lost_events; + __u32 id; + __u32 read; + } reader; + + __u64 flags; + + __u64 entries; + __u64 overrun; + __u64 read; + + __u64 Reserved1; + __u64 Reserved2; +}; + +#endif /* _TRACE_MMAP_H_ */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc9ebe593571..793ecc454039 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -9,6 +9,7 @@ #include <linux/ring_buffer.h> #include <linux/trace_clock.h> #include <linux/sched/clock.h> +#include <linux/cacheflush.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> #include <linux/irq_work.h> @@ -26,6 +27,7 @@ #include <linux/list.h> #include <linux/cpu.h> #include <linux/oom.h> +#include <linux/mm.h> #include <asm/local64.h> #include <asm/local.h> @@ -338,6 +340,7 @@ struct buffer_page { local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ unsigned order; /* order of the page */ + u32 id; /* ID for external mapping */ struct buffer_data_page *page; /* Actual data page */ }; @@ -484,6 +487,12 @@ struct ring_buffer_per_cpu { u64 read_stamp; /* pages removed since last reset */ unsigned long pages_removed; + + unsigned int mapped; + struct mutex mapping_lock; + unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct trace_buffer_meta *meta_page; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); init_waitqueue_head(&cpu_buffer->irq_work.waiters); init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + mutex_init(&cpu_buffer->mapping_lock); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) return buffer->time_stamp_abs; } -static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); - static inline unsigned long rb_page_entries(struct buffer_page *bpage) { return local_read(&bpage->entries) & RB_WRITE_MASK; @@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + + meta->reader.read = cpu_buffer->reader_page->read; + meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.lost_events = cpu_buffer->lost_events; + + meta->entries = local_read(&cpu_buffer->entries); + meta->overrun = local_read(&cpu_buffer->overrun); + meta->read = cpu_buffer->read; + + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { @@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; + if (cpu_buffer->mapped) + rb_update_meta_page(cpu_buffer); + rb_head_page_activate(cpu_buffer); cpu_buffer->pages_removed = 0; } @@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + /* It's up to the callers to not try to swap mapped buffers */ + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { + ret = -EBUSY; + goto out; + } + /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; @@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, * Otherwise, we can simply swap the page with the one passed in. */ if (read || (len < (commit - read)) || - cpu_buffer->reader_page == cpu_buffer->commit_page) { + cpu_buffer->reader_page == cpu_buffer->commit_page || + cpu_buffer->mapped) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) cpu_buffer = buffer->buffers[cpu]; + if (cpu_buffer->mapped) { + err = -EBUSY; + goto error; + } + /* Update the number of pages to match the new size */ nr_pages = old_size * buffer->buffers[cpu]->nr_pages; nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); @@ -6057,6 +6096,358 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); +static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct page *page; + + if (cpu_buffer->meta_page) + return 0; + + page = alloc_page(GFP_USER | __GFP_ZERO); + if (!page) + return -ENOMEM; + + cpu_buffer->meta_page = page_to_virt(page); + + return 0; +} + +static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long addr = (unsigned long)cpu_buffer->meta_page; + + free_page(addr); + cpu_buffer->meta_page = NULL; +} + +static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long *subbuf_ids) +{ + struct trace_buffer_meta *meta = cpu_buffer->meta_page; + unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; + struct buffer_page *first_subbuf, *subbuf; + int id = 0; + + subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; + cpu_buffer->reader_page->id = id++; + + first_subbuf = subbuf = rb_set_head_page(cpu_buffer); + do { + if (WARN_ON(id >= nr_subbufs)) + break; + + subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf->id = id; + + rb_inc_page(&subbuf); + id++; + } while (subbuf != first_subbuf); + + /* install subbuf ID to kern VA translation */ + cpu_buffer->subbuf_ids = subbuf_ids; + + /* __rb_map_vma() pads the meta-page to align it with the sub-buffers */ + meta->meta_page_size = PAGE_SIZE << cpu_buffer->buffer->subbuf_order; + meta->meta_struct_len = sizeof(*meta); + meta->nr_subbufs = nr_subbufs; + meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE; + + rb_update_meta_page(cpu_buffer); +} + +static struct ring_buffer_per_cpu * +rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-EINVAL); + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + mutex_unlock(&cpu_buffer->mapping_lock); + return ERR_PTR(-ENODEV); + } + + return cpu_buffer; +} + +static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + mutex_unlock(&cpu_buffer->mapping_lock); +} + +/* + * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need + * to be set-up or torn-down. + */ +static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer, + bool inc) +{ + unsigned long flags; + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + if (inc && cpu_buffer->mapped == UINT_MAX) + return -EBUSY; + + if (WARN_ON(!inc && cpu_buffer->mapped == 0)) + return -EINVAL; + + mutex_lock(&cpu_buffer->buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (inc) + cpu_buffer->mapped++; + else + cpu_buffer->mapped--; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + mutex_unlock(&cpu_buffer->buffer->mutex); + + return 0; +} + +#define subbuf_page(off, start) \ + virt_to_page((void *)((start) + ((off) << PAGE_SHIFT))) + +#define foreach_subbuf_page(sub_order, start, page) \ + page = subbuf_page(0, (start)); \ + for (int __off = 0; __off < (1 << (sub_order)); \ + __off++, page = subbuf_page(__off, (start))) + +/* + * +--------------+ pgoff == 0 + * | meta page | + * +--------------+ pgoff == 1 + * | 000000000 | + * +--------------+ pgoff == (1 << subbuf_order) + * | subbuffer 0 | + * | | + * +--------------+ pgoff == (2 * (1 << subbuf_order)) + * | subbuffer 1 | + * | | + * ... + */ +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, + struct vm_area_struct *vma) +{ + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff; + unsigned int subbuf_pages, subbuf_order; + struct page **pages; + int p = 0, s = 0; + int err; + + lockdep_assert_held(&cpu_buffer->mapping_lock); + + subbuf_order = cpu_buffer->buffer->subbuf_order; + subbuf_pages = 1 << subbuf_order; + + if (subbuf_order && pgoff % subbuf_pages) + return -EINVAL; + + nr_subbufs = cpu_buffer->nr_pages + 1; + nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; + + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + if (!vma_pages || vma_pages > nr_pages) + return -EINVAL; + + nr_pages = vma_pages; + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (!pgoff) { + unsigned long meta_page_padding; + + pages[p++] = virt_to_page(cpu_buffer->meta_page); + + /* + * Pad with the zero-page to align the meta-page with the + * sub-buffers. + */ + meta_page_padding = subbuf_pages - 1; + while (meta_page_padding-- && p < nr_pages) + pages[p++] = ZERO_PAGE(0); + } else { + /* Skip the meta-page */ + pgoff -= subbuf_pages; + + s += pgoff / subbuf_pages; + } + + while (s < nr_subbufs && p < nr_pages) { + struct page *page; + + foreach_subbuf_page(subbuf_order, cpu_buffer->subbuf_ids[s], page) { + if (p >= nr_pages) + break; + + pages[p++] = page; + } + s++; + } + + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); + + kfree(pages); + + return err; +} + +int ring_buffer_map(struct trace_buffer *buffer, int cpu, + struct vm_area_struct *vma) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags, *subbuf_ids; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (cpu_buffer->mapped) { + err = __rb_map_vma(cpu_buffer, vma); + if (!err) + err = __rb_inc_dec_mapped(cpu_buffer, true); + mutex_unlock(&cpu_buffer->mapping_lock); + return err; + } + + /* prevent another thread from changing buffer/sub-buffer sizes */ + mutex_lock(&buffer->mutex); + + err = rb_alloc_meta_page(cpu_buffer); + if (err) + goto unlock; + + /* subbuf_ids include the reader while nr_pages does not */ + subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); + if (!subbuf_ids) { + rb_free_meta_page(cpu_buffer); + err = -ENOMEM; + goto unlock; + } + + atomic_inc(&cpu_buffer->resize_disabled); + + /* + * Lock all readers to block any subbuf swap until the subbuf IDs are + * assigned. + */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_setup_ids_meta_page(cpu_buffer, subbuf_ids); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + err = __rb_map_vma(cpu_buffer, vma); + if (!err) { + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + cpu_buffer->mapped = 1; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } else { + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + } +unlock: + mutex_unlock(&buffer->mutex); + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + int err = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + mutex_lock(&cpu_buffer->mapping_lock); + + if (!cpu_buffer->mapped) { + err = -ENODEV; + goto out; + } else if (cpu_buffer->mapped > 1) { + __rb_inc_dec_mapped(cpu_buffer, false); + goto out; + } + + mutex_lock(&buffer->mutex); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + cpu_buffer->mapped = 0; + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + kfree(cpu_buffer->subbuf_ids); + cpu_buffer->subbuf_ids = NULL; + rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); +out: + mutex_unlock(&cpu_buffer->mapping_lock); + + return err; +} + +int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long reader_size; + unsigned long flags; + + cpu_buffer = rb_get_mapped_buffer(buffer, cpu); + if (IS_ERR(cpu_buffer)) + return (int)PTR_ERR(cpu_buffer); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); +consume: + if (rb_per_cpu_empty(cpu_buffer)) + goto out; + + reader_size = rb_page_size(cpu_buffer->reader_page); + + /* + * There are data to be read on the current reader page, we can + * return to the caller. But before that, we assume the latter will read + * everything. Let's update the kernel reader accordingly. + */ + if (cpu_buffer->reader_page->read < reader_size) { + while (cpu_buffer->reader_page->read < reader_size) + rb_advance_reader(cpu_buffer); + goto out; + } + + if (WARN_ON(!rb_get_reader_page(cpu_buffer))) + goto out; + + goto consume; +out: + /* Some archs do not have data cache coherency between kernel and user-space */ + flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + + rb_update_meta_page(cpu_buffer); + + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + rb_put_mapped_buffer(cpu_buffer); + + return 0; +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in
In preparation for allowing the user-space to map a ring-buffer, add a set of mapping functions: ring_buffer_{map,unmap}() And controls on the ring-buffer: ring_buffer_map_get_reader() /* swap reader and head */ Mapping the ring-buffer also involves: A unique ID for each subbuf of the ring-buffer, currently they are only identified through their in-kernel VA. A meta-page, where are stored ring-buffer statistics and a description for the current reader The linear mapping exposes the meta-page, and each subbuf of the ring-buffer, ordered following their unique ID, assigned during the first mapping. Once mapped, no subbuf can get in or out of the ring-buffer: the buffer size will remain unmodified and the splice enabling functions will in reality simply memcpy the data instead of swapping subbufs. CC: <linux-mm@kvack.org> Signed-off-by: Vincent Donnefort <vdonnefort@google.com>