diff mbox series

[5/8] libmultipath: Only set max_sectors_kb on map creation

Message ID 20240417184644.6193-6-mwilck@suse.com (mailing list archive)
State Not Applicable, archived
Delegated to: christophe varoqui
Headers show
Series multipath-tools: max_sectors_kb rework | expand

Commit Message

Martin Wilck April 17, 2024, 6:46 p.m. UTC
Changing max_sectors_kb on a live map is dangerous. When I/O occurs while the
max_sectors_kb value of a map is larger than that of any of its paths, I/O
errors like this may result:

  blk_insert_cloned_request: over max size limit. (127 > 126)

This situation must be strictly avoided. The kernel makes sure that the
map's limits match those of the paths when the map is created. But setting
max_sectors_kb on path devices before reloading is dangerous, even if we
read the value from the map's max_sectors_kb beforehand. The reason for
this is that the sysfs value max_sectors_kb is the kernel's max_sectors
divided by 2, and user space has no way to figure out if the kernel-internal
value is odd or even. Thus by writing back the value just read to
max_sectors_kb, one might actually decrease the kernel value by one.

The only safe way to set max_sectors_kb on a live map would be to suspend
it, flushing all outstanding IO, then the path max_sectors_kb, reload,
and resume.

Since commit 8fd4868 ("libmultipath: don't set max_sectors_kb on reloads"),
we don't set the configured max_sectors_kb any more. But as shown above,
this is not safe. So really only set max_sectors_kb when creating a map.

Users who have stacked block devices on top of multipath, as described in the
commit message of 8fd4868 must make sure that the max_sectors_kb setting is
correct when the multipath map is first created. Decreasing the map's
max_sectors_kb value (without touching the paths) should actually be possible
in this situation, because stacked block devices are usually bio-based, and
bio-based IO (in contrast to request-based) can be split if the lower-level
device has can't handle the size of the I/O.

Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 libmultipath/configure.c | 38 ++++----------------------------------
 1 file changed, 4 insertions(+), 34 deletions(-)
diff mbox series

Patch

diff --git a/libmultipath/configure.c b/libmultipath/configure.c
index 13602f3..8cbb2a8 100644
--- a/libmultipath/configure.c
+++ b/libmultipath/configure.c
@@ -566,46 +566,18 @@  trigger_paths_udev_change(struct multipath *mpp, bool is_mpath)
 	mpp->needs_paths_uevent = 0;
 }
 
-static int
-sysfs_set_max_sectors_kb(struct multipath *mpp, int is_reload)
+static int sysfs_set_max_sectors_kb(struct multipath *mpp)
 {
 	struct pathgroup * pgp;
 	struct path *pp;
 	char buff[11];
 	ssize_t len;
 	int i, j, ret, err = 0;
-	struct udev_device *udd;
-	int max_sectors_kb;
 
 	if (mpp->max_sectors_kb == MAX_SECTORS_KB_UNDEF)
 		return 0;
-	max_sectors_kb = mpp->max_sectors_kb;
-	if (is_reload) {
-		if (!has_dm_info(mpp) &&
-		    dm_get_info(mpp->alias, &mpp->dmi) != 0) {
-			condlog(1, "failed to get dm info for %s", mpp->alias);
-			return 1;
-		}
-		udd = get_udev_for_mpp(mpp);
-		if (!udd) {
-			condlog(1, "failed to get udev device to set max_sectors_kb for %s", mpp->alias);
-			return 1;
-		}
-		ret = sysfs_attr_get_value(udd, "queue/max_sectors_kb", buff,
-					   sizeof(buff));
-		udev_device_unref(udd);
-		if (!sysfs_attr_value_ok(ret, sizeof(buff))) {
-			condlog(1, "failed to get current max_sectors_kb from %s", mpp->alias);
-			return 1;
-		}
-		if (sscanf(buff, "%u\n", &max_sectors_kb) != 1) {
-			condlog(1, "can't parse current max_sectors_kb from %s",
-				mpp->alias);
-			return 1;
-		}
-	}
-	snprintf(buff, 11, "%d", max_sectors_kb);
-	len = strlen(buff);
+
+	len = snprintf(buff, sizeof(buff), "%d", mpp->max_sectors_kb);
 
 	vector_foreach_slot (mpp->pg, pgp, i) {
 		vector_foreach_slot(pgp->paths, pp, j) {
@@ -923,7 +895,7 @@  int domap(struct multipath *mpp, char *params, int is_daemon)
 			return DOMAP_RETRY;
 		}
 
-		sysfs_set_max_sectors_kb(mpp, 0);
+		sysfs_set_max_sectors_kb(mpp);
 		if (is_daemon && mpp->ghost_delay > 0 && count_active_paths(mpp) &&
 		    pathcount(mpp, PATH_UP) == 0)
 			mpp->ghost_delay_tick = mpp->ghost_delay;
@@ -934,7 +906,6 @@  int domap(struct multipath *mpp, char *params, int is_daemon)
 
 	case ACT_RELOAD:
 	case ACT_RELOAD_RENAME:
-		sysfs_set_max_sectors_kb(mpp, 1);
 		if (mpp->ghost_delay_tick > 0 && pathcount(mpp, PATH_UP))
 			mpp->ghost_delay_tick = 0;
 		r = dm_addmap_reload(mpp, params, 0);
@@ -942,7 +913,6 @@  int domap(struct multipath *mpp, char *params, int is_daemon)
 
 	case ACT_RESIZE:
 	case ACT_RESIZE_RENAME:
-		sysfs_set_max_sectors_kb(mpp, 1);
 		if (mpp->ghost_delay_tick > 0 && pathcount(mpp, PATH_UP))
 			mpp->ghost_delay_tick = 0;
 		r = dm_addmap_reload(mpp, params, 1);