@@ -217,6 +217,7 @@ struct dm_table {
/* a list of devices used by this table */
struct list_head devices;
struct rw_semaphore devices_lock;
+ unsigned short num_devices;
/* events get handed up using this callback */
void (*event_fn)(void *data);
@@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
int dm_table_resume_targets(struct dm_table *t)
{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd;
unsigned int i;
int r = 0;
@@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
ti->type->resume(ti);
}
+ t->num_devices = 0;
+
+ list_for_each_entry(dd, devices, list)
+ dd->dm_dev->index = ++(t->num_devices);
+
return 0;
}
@@ -48,6 +48,8 @@
*/
#define REQ_DM_POLL_LIST REQ_DRV
+#define DM_MAX_TABLE_DEVICES 1024
+
static const char *_name = DM_NAME;
static unsigned int major;
@@ -1543,10 +1545,39 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe
return ret;
}
+/*
+ * has_redundant_flush - decide whether flush deduplication is worthwhile
+ * @t:      the dm_table being flushed
+ * @bitmap: out-param; on success, a zeroed bitmap indexed by dm_dev->index
+ *
+ * Returns true (and sets *bitmap) only when the table has more targets than
+ * devices, i.e. when __send_empty_flush() could otherwise flush the same
+ * device multiple times. On false return *bitmap is left untouched, so the
+ * caller must not read or free it.
+ */
+static inline bool has_redundant_flush(struct dm_table *t,
+				       unsigned long **bitmap)
+{
+	/* Bound bitmap memory usage for pathologically large tables */
+	if (t->num_devices >= DM_MAX_TABLE_DEVICES)
+		return false;
+
+	if (t->num_devices < t->num_targets) {
+		/*
+		 * dm_dev indices start at 1, so num_devices + 1 bits are
+		 * needed. The guard above already keeps this <=
+		 * DM_MAX_TABLE_DEVICES; clamping with min() here (as a
+		 * belt-and-braces measure) would be wrong, because an
+		 * undersized bitmap turns the later __test_and_set_bit()
+		 * into an out-of-bounds write.
+		 */
+		*bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
+		if (*bitmap)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * dm_get_dev_index - iterate_devices callback that reports a device's
+ * table-wide sequential index (dm_dev->index, assigned at resume time).
+ *
+ * Always returns 0, so iteration continues over every device of the
+ * target and *index ends up holding the LAST device's index.
+ * NOTE(review): for a target that maps more than one device, the dedup
+ * logic in __send_empty_flush() therefore keys on a single device —
+ * confirm a multi-device target cannot be skipped while one of its
+ * other underlying devices still needs the flush.
+ */
+static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ unsigned short *index = data;
+ *index = dev->index;
+ return 0;
+}
+
static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
+ unsigned long *handled_map;
+ unsigned int nr_handled = 0;
+ bool check = has_redundant_flush(t, &handled_map);
/*
* Use an on-stack bio for this, it's safe since we don't
@@ -1562,17 +1593,46 @@ static void __send_empty_flush(struct clone_info *ci)
for (unsigned int i = 0; i < t->num_targets; i++) {
unsigned int bios;
+ unsigned short index = 0;
struct dm_target *ti = dm_table_get_target(t, i);
if (unlikely(ti->num_flush_bios == 0))
continue;
+ /*
+ * If the num_targets is greater than the number of block devices
+ * in the dm_table's devices list, __send_empty_flush() might
+ * invoke __send_duplicate_bios() multiple times for the same
+ * block device. This could lead to a substantial decrease in
+ * performance when num_targets significantly exceeds the number
+ * of block devices.
+ * Ensure that __send_duplicate_bios() is only called once for
+ * each block device.
+ */
+ if (check) {
+ if (nr_handled == t->num_devices)
+ break;
+
+ if (ti->type->iterate_devices)
+ ti->type->iterate_devices(ti, dm_get_dev_index, &index);
+
+ if (index > 0) {
+ if (__test_and_set_bit(index, handled_map))
+ continue;
+ else
+ nr_handled++;
+ }
+ }
+
atomic_add(ti->num_flush_bios, &ci->io->io_count);
bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
NULL, GFP_NOWAIT);
atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
}
+ if (check)
+ bitmap_free(handled_map);
+
/*
* alloc_io() takes one extra reference for submission, so the
* reference won't reach 0 without the following subtraction
@@ -169,6 +169,12 @@ struct dm_dev {
struct dax_device *dax_dev;
blk_mode_t mode;
char name[16];
+
+ /*
+ * sequential number for each dm_dev in dm_table's devices list,
+ * start from 1
+ */
+ unsigned short index;
};
/*
__send_empty_flush() sends empty flush bios to every target in the dm_table. However, if the num_targets exceeds the number of block devices in the dm_table's device list, it could lead to multiple invocations of __send_duplicate_bios() for the same block device. Typically, a single thread sending numerous empty flush bios to one block device is redundant, as these bios are likely to be merged by the flush state machine. In scenarios where num_targets significantly outweighs the number of block devices, such behavior may result in a noteworthy decrease in performance. This issue can be reproduced using this command line: for i in {0..1023}; do echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i)); done | dmsetup create example With this fix, a random write with fsync workload executed with the following fio command: fio --group_reporting --name=benchmark --filename=/dev/mapper/example \ --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \ --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1 results in an increase from 857 KB/s to 30.8 MB/s of the write throughput (3580% increase). Signed-off-by: Yang Yang <yang.yang@vivo.com> --- drivers/md/dm-core.h | 1 + drivers/md/dm-table.c | 7 ++++ drivers/md/dm.c | 60 +++++++++++++++++++++++++++++++++++ include/linux/device-mapper.h | 6 ++++ 4 files changed, 74 insertions(+)