@@ -154,6 +154,14 @@ CFLAGS ?= -g -Wall
CPPFLAGS ?=
LDFLAGS ?=
+USEREVENTS_INSTALLED := $(shell if (echo "$(pound)include <linux/user_events.h>" | $(CC) -E - >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
+export USEREVENTS_INSTALLED
+ifeq ($(USEREVENTS_INSTALLED), 1)
+CFLAGS += -DUSEREVENTS
+else
+$(warning linux/user_events.h not installed, skipping user_events support)
+endif
+
CUNIT_INSTALLED := $(shell if (printf "$(pound)include <CUnit/Basic.h>\n void main(){CU_initialize_registry();}" | $(CC) -x c - -lcunit -o /dev/null >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
export CUNIT_INSTALLED
@@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep,
struct tep_event *get_tep_event(struct tep_handle *tep,
const char *system, const char *name);
+/* Internal interface for ftrace user events */
+
+struct tracefs_user_event_group;
+
+struct tracefs_user_event_internal
+{
+ struct tracefs_user_event event_external;
+ int write_index; /* index returned by DIAG_IOCSREG, written first on each record */
+ int iovecs; /* iovec count: one for write_index plus one per registered item */
+ int rels; /* number of __rel_loc (variable length) items */
+ int len; /* size in bytes of the fixed-size payload, including __rel_loc headers */
+ struct tracefs_user_event_group *group;
+ struct tracefs_user_event_internal *next;
+};
+
+struct tracefs_user_event_group
+{
+ int fd; /* user_events_data file descriptor used for writes */
+ int mmap_len;
+ char *mmap; /* mmapped user_events_status page(s) used for enabled checks */
+ pthread_mutex_t lock; /* protects the events list */
+ struct tracefs_user_event_internal *events;
+};
+
#endif /* _TRACE_FS_LOCAL_H */
@@ -571,4 +571,71 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name,
struct tep_event *
tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth);
+/* User events */
+enum tracefs_uevent_type {
+ TRACEFS_UEVENT_END,
+ TRACEFS_UEVENT_u8,
+ TRACEFS_UEVENT_s8,
+ TRACEFS_UEVENT_u16,
+ TRACEFS_UEVENT_s16,
+ TRACEFS_UEVENT_u32,
+ TRACEFS_UEVENT_s32,
+ TRACEFS_UEVENT_u64,
+ TRACEFS_UEVENT_s64,
+ TRACEFS_UEVENT_string,
+ TRACEFS_UEVENT_struct,
+ TRACEFS_UEVENT_varray,
+ TRACEFS_UEVENT_vstring,
+};
+
+enum tracefs_uevent_flags {
+ /* None */
+ TRACEFS_UEVENT_FLAG_NONE = 0,
+
+ /* When BPF is attached, use iterator/no copy */
+ TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0,
+};
+
+struct tracefs_uevent_item {
+ /* Type of item */
+ enum tracefs_uevent_type type;
+
+ /* Length of data, optional during register */
+ int len;
+
+ union {
+ /* Used during write */
+ const void *data;
+
+ /* Used during register */
+ const char *name;
+ };
+};
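+
+/*
+ * The same item array shape is used in both directions: 'name' (plus 'len'
+ * for string and struct types) is read by tracefs_user_event_register(),
+ * while 'data' and 'len' are read by tracefs_user_event_record(). A purely
+ * illustrative register-time layout:
+ *
+ *   struct tracefs_uevent_item fields[] = {
+ *       { .type = TRACEFS_UEVENT_u32, .name = "count" },
+ *       { .type = TRACEFS_UEVENT_vstring, .name = "msg" },
+ *       { .type = TRACEFS_UEVENT_END },
+ *   };
+ */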
+
+struct tracefs_user_event {
+ unsigned int size;
+ char *enabled;
+};
+
+struct tracefs_user_event_group;
+
+struct tracefs_user_event_group *tracefs_user_event_group_open(void);
+
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group);
+
+int tracefs_user_event_delete(const char *name);
+
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+ const char *name, enum tracefs_uevent_flags flags,
+ struct tracefs_uevent_item *items);
+
+static inline bool tracefs_user_event_enabled(struct tracefs_user_event *event)
+{
+ return event && ((volatile char *)event->enabled)[0] != 0;
+}
+
+int tracefs_user_event_record(struct tracefs_user_event *event,
+ struct tracefs_uevent_item *items);
+
#endif /* _TRACE_FS_H */
@@ -14,6 +14,10 @@ OBJS += tracefs-filter.o
OBJS += tracefs-dynevents.o
OBJS += tracefs-eprobes.o
+ifeq ($(USEREVENTS_INSTALLED), 1)
+OBJS += tracefs-userevents.o
+endif
+
# Order matters for the the three below
OBJS += sqlhist-lex.o
OBJS += sqlhist.tab.o
new file mode 100644
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2022 Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <alloca.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <linux/user_events.h>
+
+#include "tracefs.h"
+#include "tracefs-local.h"
+
+#define STAT_FILE "user_events_status"
+#define DATA_FILE "user_events_data"
+
+static void free_user_events(struct tracefs_user_event_internal *event)
+{
+ struct tracefs_user_event_internal *next;
+
+ while (event) {
+ next = event->next;
+ free(event);
+ event = next;
+ }
+}
+
+static int append_field(struct tracefs_uevent_item *item, struct trace_seq *seq,
+ int index)
+{
+ if (index != 0)
+ trace_seq_printf(seq, ";");
+
+ switch (item->type) {
+ case TRACEFS_UEVENT_u8:
+ trace_seq_printf(seq, " u8 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_s8:
+ trace_seq_printf(seq, " s8 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_u16:
+ trace_seq_printf(seq, " u16 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_s16:
+ trace_seq_printf(seq, " s16 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_u32:
+ trace_seq_printf(seq, " u32 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_s32:
+ trace_seq_printf(seq, " s32 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_u64:
+ trace_seq_printf(seq, " u64 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_s64:
+ trace_seq_printf(seq, " s64 %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_string:
+ if (item->len <= 0) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ trace_seq_printf(seq, " char[%d] %s", item->len, item->name);
+ break;
+
+ case TRACEFS_UEVENT_struct:
+ /*
+ * struct must have 2 strings, do simple check
+ * in user, kernel will fully validate
+ */
+ if (!strchr(item->name, ' ')) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (item->len <= 0) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ trace_seq_printf(seq, " struct %s %d", item->name, item->len);
+ break;
+
+ case TRACEFS_UEVENT_varray:
+ /* Variable length array */
+ trace_seq_printf(seq, " __rel_loc u8[] %s", item->name);
+ break;
+
+ case TRACEFS_UEVENT_vstring:
+ /* Variable length string */
+ trace_seq_printf(seq, " __rel_loc char[] %s", item->name);
+ break;
+
+ default:
+ /* Unknown */
+ errno = ENOENT;
+ return -1;
+ }
+
+ return 0;
+}
+
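+/*
+ * Builds the text command passed to DIAG_IOCSREG. For example (purely
+ * illustrative names), registering "mytest" with a u32 "count", a vstring
+ * "msg" and the bpf_iter flag would produce:
+ *
+ *   mytest:BPF_ITER u32 count; __rel_loc char[] msg
+ */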
+static int create_reg_cmd(const char *name, enum tracefs_uevent_flags flags,
+ struct tracefs_uevent_item *item, struct trace_seq *seq)
+{
+ int ret, index = 0;
+
+ trace_seq_printf(seq, "%s", name);
+
+ if (flags & TRACEFS_UEVENT_FLAG_bpf_iter)
+ trace_seq_printf(seq, ":BPF_ITER");
+
+ while (item->type != TRACEFS_UEVENT_END) {
+ ret = append_field(item, seq, index++);
+
+ if (ret < 0)
+ return ret;
+
+ item++;
+ }
+
+ trace_seq_terminate(seq);
+
+ if (seq->state) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ return 0;
+}
+
+static int get_write_counts(struct tracefs_user_event_internal *event,
+ struct tracefs_uevent_item *item)
+{
+ event->rels = 0;
+ event->len = 0;
+
+ /* Start at 1, need iovec for write_index */
+ event->iovecs = 1;
+
+ while (item->type != TRACEFS_UEVENT_END) {
+ switch (item->type) {
+ case TRACEFS_UEVENT_u8:
+ case TRACEFS_UEVENT_s8:
+ event->len += sizeof(__u8);
+ break;
+
+ case TRACEFS_UEVENT_u16:
+ case TRACEFS_UEVENT_s16:
+ event->len += sizeof(__u16);
+ break;
+
+ case TRACEFS_UEVENT_u32:
+ case TRACEFS_UEVENT_s32:
+ event->len += sizeof(__u32);
+ break;
+
+ case TRACEFS_UEVENT_u64:
+ case TRACEFS_UEVENT_s64:
+ event->len += sizeof(__u64);
+ break;
+
+ case TRACEFS_UEVENT_string:
+ case TRACEFS_UEVENT_struct:
+ event->len += item->len;
+ break;
+
+ case TRACEFS_UEVENT_varray:
+ case TRACEFS_UEVENT_vstring:
+ /* Requires a rel loc entry */
+ event->len += sizeof(__u32);
+ event->rels++;
+ break;
+
+ default:
+ /* Unknown */
+ errno = ENOENT;
+ return -1;
+ }
+
+ event->iovecs++;
+ item++;
+ }
+
+ return 0;
+}
+
+/**
+ * tracefs_user_event_group_open - Opens a new group to use for user events
+ *
+ * Returns a pointer to a group to use for user events. The pointer is valid
+ * until tracefs_user_event_group_close() is called. In case of an error NULL
+ * is returned.
+ */
+struct tracefs_user_event_group *tracefs_user_event_group_open(void)
+{
+ int stat, write, page_size, i;
+ struct tracefs_user_event_group *group;
+
+ stat = tracefs_instance_file_open(NULL, STAT_FILE, O_RDWR);
+
+ if (stat < 0)
+ return NULL;
+
+ write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+
+ if (write < 0)
+ goto put_stat;
+
+ group = malloc(sizeof(*group));
+
+ if (!group)
+ goto put_write;
+
+ if (pthread_mutex_init(&group->lock, NULL) != 0)
+ goto put_group;
+
+ /* Scale up to 16-bit max user events a page at a time */
+ page_size = sysconf(_SC_PAGESIZE);
+ group->mmap_len = page_size;
+
+ for (i = 0; i < 16; ++i) {
+ group->mmap = mmap(NULL, group->mmap_len,
+ PROT_READ, MAP_SHARED, stat, 0);
+
+ if (group->mmap == MAP_FAILED && errno == EINVAL) {
+ /* Increase by page size and try again */
+ group->mmap_len += page_size;
+ continue;
+ }
+
+ break;
+ }
+
+ if (group->mmap == MAP_FAILED)
+ goto put_group;
+
+ group->fd = write;
+ group->events = NULL;
+
+ /* Status fd no longer needed */
+ close(stat);
+
+ return group;
+
+put_group:
+ free(group);
+put_write:
+ close(write);
+put_stat:
+ close(stat);
+
+ return NULL;
+}
+
+/**
+ * tracefs_user_event_delete - Deletes a user event from the system
+ * @name: Name of the event to delete
+ *
+ * Deletes the event from the system if it is not used.
+ */
+int tracefs_user_event_delete(const char *name)
+{
+ int ret, write;
+
+ write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+
+ if (write < 0)
+ return write;
+
+ ret = ioctl(write, DIAG_IOCSDEL, name);
+
+ close(write);
+
+ return ret;
+}
+
+/**
+ * tracefs_user_event_group_close - Closes a group containing user events
+ * @group: Group to close
+ *
+ * Closes a group and all the user events within it. Any user event that has
+ * been added to the group is no longer valid and cannot be used.
+ */
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group)
+{
+ if (!group)
+ return;
+
+ if (group->mmap != MAP_FAILED)
+ munmap(group->mmap, group->mmap_len);
+
+ if (group->fd != -1)
+ close(group->fd);
+
+ free_user_events(group->events);
+ free(group);
+}
+
+/**
+ * tracefs_user_event_register - Registers a user event with the system
+ * @group: Group to add the user event to
+ * @name: Name of the event to register
+ * @flags: Flags to use
+ * @items: Array of items that the event contains
+ *
+ * Allocates and registers a user event with the system. The user event will be
+ * added to the @group. The lifetime of the event is bound to the @group. When
+ * the @group is closed via tracefs_user_event_group_close() the event will no
+ * longer exist and should not be used.
+ *
+ * The @items are processed in order and the final item type must be set to
+ * TRACEFS_UEVENT_END to mark the last item. Each item must have the type
+ * and name defined. The string and struct type also require the len to be set
+ * for the item.
+ *
+ * Return a pointer to a user event on success, or NULL on error.
+ *
+ * errno will be set to EINVAL if @group or @items is NULL, or if @items
+ * contains an invalid entry.
+ */
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+ const char *name, enum tracefs_uevent_flags flags,
+ struct tracefs_uevent_item *items)
+{
+ struct tracefs_user_event_internal *event = NULL;
+ struct user_reg reg = {0};
+ struct trace_seq seq;
+
+ if (!group || !items) {
+ errno = EINVAL;
+ return NULL;
+ }
+
+ trace_seq_init(&seq);
+
+ /* Populate cmd */
+ if (create_reg_cmd(name, flags, items, &seq))
+ goto put_seq;
+
+ event = malloc(sizeof(*event));
+
+ if (!event)
+ goto put_seq;
+
+ reg.size = sizeof(reg);
+ reg.name_args = (__u64)seq.buffer;
+
+ /* Register event with kernel */
+ if (ioctl(group->fd, DIAG_IOCSREG, &reg) == -1)
+ goto put_event;
+
+ /* Sanity check bounds returned */
+ if (reg.status_index >= group->mmap_len) {
+ errno = EINVAL;
+ goto put_event;
+ }
+
+ if (get_write_counts(event, items))
+ goto put_event;
+
+ /* Keep track of user view at this point in time */
+ event->event_external.size = sizeof(event->event_external);
+ event->event_external.enabled = &group->mmap[reg.status_index];
+
+ event->write_index = reg.write_index;
+ event->group = group;
+
+ /* Add event into the group under lock */
+ pthread_mutex_lock(&group->lock);
+ event->next = group->events;
+ group->events = event;
+ pthread_mutex_unlock(&group->lock);
+
+ trace_seq_destroy(&seq);
+
+ return &event->event_external;
+put_event:
+ free(event);
+put_seq:
+ trace_seq_destroy(&seq);
+
+ return NULL;
+}
+
+/**
+ * tracefs_user_event_record - Records an event with data
+ * @event: User event to record data about
+ * @items: Items to write for the event
+ *
+ * Records items for the event. Callers should check if the cost of recording
+ * should be performed by calling tracefs_user_event_enabled(). Items are
+ * checked to ensure they fit within the described items during register. Each
+ * item must specify the length of the item being recorded.
+ *
+ * Return the number of bytes recorded or -1 upon error.
+ *
+ * errno will be set to EINVAL if @event or @items is null or @items contains
+ * an item with a length of less than or equal to 0.
+ * errno will be set to E2BIG if @items contains more items than previously
+ * registered for the event.
+ */
+int tracefs_user_event_record(struct tracefs_user_event *event,
+ struct tracefs_uevent_item *items)
+{
+ struct tracefs_user_event_internal *e;
+ struct iovec *head, *io, *relio, *io_end;
+ __u32 *rel, *rel_end;
+ int len, rel_offset, data_offset, used;
+
+ if (!event || !items) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ e = (struct tracefs_user_event_internal *)event;
+ head = io = alloca(sizeof(*io) * (e->iovecs + e->rels));
+ rel = alloca(sizeof(*rel) * e->rels);
+
+ io_end = head + (e->iovecs + e->rels);
+ rel_end = rel + e->rels;
+
+ /* Relative offset starts at end of static data */
+ relio = io + e->iovecs;
+ rel_offset = e->len;
+ data_offset = 0;
+
+ /* Write index must be first */
+ io->iov_base = &e->write_index;
+ io->iov_len = sizeof(e->write_index);
+ io++;
+ used = 1;
+
+ while (items->type != TRACEFS_UEVENT_END) {
+ len = items->len;
+
+ if (len <= 0)
+ goto bad_length;
+
+ if (io >= io_end)
+ goto bad_count;
+
+ switch (items->type) {
+ case TRACEFS_UEVENT_varray:
+ case TRACEFS_UEVENT_vstring:
+ /* Dual vectors */
+ used += 2;
+
+ if (rel >= rel_end || relio >= io_end)
+ goto bad_count;
+
+ /* __rel_loc types */
+ relio->iov_base = (void *)items->data;
+ relio->iov_len = len;
+ relio++;
+
+ io->iov_base = (void *)rel;
+ io->iov_len = sizeof(*rel);
+ io++;
+ rel_offset -= sizeof(*rel);
+
+ /* Fill in rel loc data */
+ *rel = DYN_LOC(rel_offset + data_offset, len);
+ data_offset += len;
+ rel++;
+
+ break;
+
+ default:
+ /* Single vector */
+ used++;
+
+ /* Direct types */
+ io->iov_base = (void *)items->data;
+ io->iov_len = len;
+ io++;
+ rel_offset -= len;
+
+ break;
+ }
+
+ items++;
+ }
+
+ return writev(e->group->fd, head, used);
+
+bad_length:
+ fprintf(stderr, "Bad user_event item length at index %d\n",
+ used - 1);
+ errno = EINVAL;
+ return -1;
+
+bad_count:
+ fprintf(stderr, "Too many user_event items passed\n");
+ errno = E2BIG;
+ return -1;
+}
User events are scheduled to be included in Linux 5.18. They register a
special mmapped page that denotes when a user event has been enabled
(from an external source). This API adds a wrapper around the kernel
interface that makes it easy to register user events, test whether they
are enabled, and record them when they are.

Link: https://lore.kernel.org/linux-trace-devel/20220121192833.GA3128@kbox/T/#m2bcf53c373fbeaba2c46d1a053b3174171167e4e

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
 Makefile                 |   8 +
 include/tracefs-local.h  |  24 ++
 include/tracefs.h        |  67 +++++
 src/Makefile             |   4 +
 src/tracefs-userevents.c | 516 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 619 insertions(+)
 create mode 100644 src/tracefs-userevents.c
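A minimal usage sketch of the proposed API, for context. The event name,
field names and values are only illustrative, and the include path may
differ depending on how the library is installed:

#include <string.h>
#include <tracefs.h>

int main(void)
{
	struct tracefs_user_event_group *group;
	struct tracefs_user_event *event;
	unsigned int count = 1;
	const char *msg = "hello";

	/* Layout given at register time: types and field names */
	struct tracefs_uevent_item fields[] = {
		{ .type = TRACEFS_UEVENT_u32, .name = "count" },
		{ .type = TRACEFS_UEVENT_vstring, .name = "msg" },
		{ .type = TRACEFS_UEVENT_END },
	};

	/* Payload given at record time: data pointers and lengths */
	struct tracefs_uevent_item values[] = {
		{ .type = TRACEFS_UEVENT_u32, .len = sizeof(count), .data = &count },
		{ .type = TRACEFS_UEVENT_vstring, .len = strlen(msg) + 1, .data = msg },
		{ .type = TRACEFS_UEVENT_END },
	};

	group = tracefs_user_event_group_open();
	if (!group)
		return 1;

	event = tracefs_user_event_register(group, "mytest",
					    TRACEFS_UEVENT_FLAG_NONE, fields);
	if (event) {
		/* Only pay the recording cost when the event was enabled */
		if (tracefs_user_event_enabled(event))
			tracefs_user_event_record(event, values);
	}

	tracefs_user_event_group_close(group);
	return 0;
}

Once registered, the event should show up in tracefs under the
user_events system and can be enabled like any other trace event;
tracefs_user_event_delete("mytest") removes the definition once nothing
is using it.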