@@ -1035,10 +1035,21 @@ struct btrfs_fs_info {
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct kobject *discard_debug_kobj;
+ struct kobject *io_accounting_debug_kobj;
struct list_head allocated_roots;
spinlock_t eb_leak_lock;
struct list_head allocated_ebs;
+
+ spinlock_t io_accounting_lock;
+ /*
+ * The IO accounting unit are all in byte, and are in logical address
+ * space, which is before the RAID/DUP mapping.
+ */
+ u64 meta_read;
+ u64 meta_write;
+ u64 data_read;
+ u64 data_write;
#endif
};
@@ -3065,6 +3065,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
+ spin_lock_init(&fs_info->io_accounting_lock);
#endif
extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
@@ -40,6 +40,7 @@
* btrfs_debug_feature_attrs /sys/fs/btrfs/debug
* btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug
* discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard
+ * io_accounting_debug_attrs /sys/fs/btrfs/<uuid>/debug/io_accounting
*/
struct btrfs_feature_attr {
@@ -616,6 +617,62 @@ static const struct attribute_group btrfs_debug_feature_attr_group = {
.attrs = btrfs_debug_feature_attrs,
};
+/* IO accounting */
+#define io_accounting_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent)
+
+#define DECLARE_IO_ACCOUNTING_OPS(name) \
+static ssize_t io_accounting_##name##_show(struct kobject *kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_fs_info *fs_info = io_accounting_to_fs_info(kobj); \
+ u64 result; \
+ \
+ spin_lock(&fs_info->io_accounting_lock); \
+ result = fs_info->name; \
+ spin_unlock(&fs_info->io_accounting_lock); \
+ return sysfs_emit(buf, "%llu\n", result); \
+} \
+static ssize_t io_accounting_##name##_store(struct kobject *kobj, \
+ struct kobj_attribute *a, \
+ const char *buf, \
+ size_t count) \
+{ \
+ struct btrfs_fs_info *fs_info = io_accounting_to_fs_info(kobj); \
+ u64 value; \
+ int ret; \
+ \
+ ret = kstrtoull(skip_spaces(buf), 0, &value); \
+ if (ret) \
+ return ret; \
+ spin_lock(&fs_info->io_accounting_lock); \
+ fs_info->name = value; \
+ spin_unlock(&fs_info->io_accounting_lock); \
+ return count; \
+}
+
+DECLARE_IO_ACCOUNTING_OPS(meta_read);
+DECLARE_IO_ACCOUNTING_OPS(meta_write);
+DECLARE_IO_ACCOUNTING_OPS(data_read);
+DECLARE_IO_ACCOUNTING_OPS(data_write);
+
+BTRFS_ATTR_RW(io_accounting, meta_read, io_accounting_meta_read_show,
+ io_accounting_meta_read_store);
+BTRFS_ATTR_RW(io_accounting, meta_write, io_accounting_meta_write_show,
+ io_accounting_meta_write_store);
+BTRFS_ATTR_RW(io_accounting, data_read, io_accounting_data_read_show,
+ io_accounting_data_read_store);
+BTRFS_ATTR_RW(io_accounting, data_write, io_accounting_data_write_show,
+ io_accounting_data_write_store);
+
+static const struct attribute *io_accounting_debug_attrs[] = {
+ BTRFS_ATTR_PTR(io_accounting, meta_read),
+ BTRFS_ATTR_PTR(io_accounting, meta_write),
+ BTRFS_ATTR_PTR(io_accounting, data_read),
+ BTRFS_ATTR_PTR(io_accounting, data_write),
+ NULL,
+};
+
#endif
static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
@@ -1219,6 +1276,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
kobject_del(fs_info->discard_debug_kobj);
kobject_put(fs_info->discard_debug_kobj);
}
+ if (fs_info->io_accounting_debug_kobj) {
+ sysfs_remove_files(fs_info->io_accounting_debug_kobj,
+ io_accounting_debug_attrs);
+ kobject_del(fs_info->io_accounting_debug_kobj);
+ kobject_put(fs_info->io_accounting_debug_kobj);
+ }
if (fs_info->debug_kobj) {
sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
kobject_del(fs_info->debug_kobj);
@@ -1804,6 +1867,20 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
discard_debug_attrs);
if (error)
goto failure;
+
+ /* io_accounting directory */
+ fs_info->io_accounting_debug_kobj =
+ kobject_create_and_add("io_accounting",fs_info->debug_kobj);
+ if (!fs_info->io_accounting_debug_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->io_accounting_debug_kobj,
+ io_accounting_debug_attrs);
+ if (error)
+ goto failure;
+
#endif
error = addrm_unknown_feature_attrs(fs_info, true);
@@ -6763,6 +6763,29 @@ static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logic
}
}
+static void update_io_accounting(struct btrfs_fs_info *fs_info, struct bio *bio)
+{
+ u32 length = bio->bi_iter.bi_size;
+ bool metadata = bio->bi_opf & REQ_META;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ spin_lock(&fs_info->io_accounting_lock);
+ if (bio_op(bio) == REQ_OP_READ) {
+ if (metadata)
+ fs_info->meta_read += length;
+ else
+ fs_info->data_read += length;
+ } else {
+ if (metadata)
+ fs_info->meta_write += length;
+ else
+ fs_info->data_write += length;
+ }
+ spin_unlock(&fs_info->io_accounting_lock);
+#endif
+ return;
+}
+
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num)
{
@@ -6776,6 +6799,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int total_devs;
struct btrfs_io_context *bioc = NULL;
+ update_io_accounting(fs_info, bio);
length = bio->bi_iter.bi_size;
map_length = length;
[BACKGROUND] There is a series of btrfs autodefrag bugs in v5.16, that would cause way more IO than previous kernels. Unfortunately there isn't any test case covering autodefrag, nor there is anyway to show the io accounting of a btrfs. [WORKAROUND] I originally want to implement a dm target to do the io accounting for all filesystems, but can not find a good enough interface (the status interface has only 3 pre-defined workload). Thus I turned to btrfs specific io accounting first. One thing specific to btrfs is its integrated volume management/RAID. Without proper profiles specification, default profile will cause metadata IO to be accounted twice (DUP profile) and only data IO is accounted correctly. So for btrfs this patch will introduce a new sysfs directory in /sys/fs/btrfs/<uuid>/debug/io_accounting/ And have the following files: - meta_read: Metadata bytes read - meta_write: Metadata bytes written - data_read: Data bytes read - data_write: Data bytes written (including both zoned append and regular write) And all these accounting is in logical address space, meaning profile will not affect the values. All those values can be reset by simply "echo 0". Signed-off-by: Qu Wenruo <wqu@suse.com> --- Reason for RFC: - (To DM guys) Is there any good way to implement a dm target to do the IO accounting? A more generic one can help more filesystems. - (To Btrfs guys) Is the sysfs interface fine? --- fs/btrfs/ctree.h | 11 +++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/sysfs.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 24 +++++++++++++++ 4 files changed, 113 insertions(+)