diff mbox series

[2/3] btrfs: use the path with the lowest latency for RAID1 reads

Message ID f284d932a61c293ac1c350ac11730b11b651300c.1727368214.git.anand.jain@oracle.com (mailing list archive)
State New, archived
Headers show
Series raid1 balancing methods | expand

Commit Message

Anand Jain Sept. 27, 2024, 9:55 a.m. UTC
This feature aims to direct the read I/O to the device with the lowest
known latency for reading RAID1 blocks.

echo "latency" > /sys/fs/btrfs/<UUID>/read_policy

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/sysfs.c   |  2 +-
 fs/btrfs/volumes.c | 40 ++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  2 ++
 3 files changed, 43 insertions(+), 1 deletion(-)

Comments

Qu Wenruo Sept. 27, 2024, 10:25 a.m. UTC | #1
在 2024/9/27 19:25, Anand Jain 写道:
> This feature aims to direct the read I/O to the device with the lowest
> known latency for reading RAID1 blocks.
>
> echo "latency" > /sys/fs/btrfs/<UUID>/read_policy
>
> Signed-off-by: Anand Jain <anand.jain@oracle.com>
> ---
>   fs/btrfs/sysfs.c   |  2 +-
>   fs/btrfs/volumes.c | 40 ++++++++++++++++++++++++++++++++++++++++
>   fs/btrfs/volumes.h |  2 ++
>   3 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index 18fb35a887c6..15abf931726c 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -1306,7 +1306,7 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
>   BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
>
>   #ifdef CONFIG_BTRFS_DEBUG
> -static const char * const btrfs_read_policy_name[] = { "pid", "rotation" };
> +static const char * const btrfs_read_policy_name[] = { "pid", "rotation", "latency" };
>   #else
>   static const char * const btrfs_read_policy_name[] = { "pid" };
>   #endif
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index c130a27386a7..20bc62d85b3b 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -12,6 +12,9 @@
>   #include <linux/uuid.h>
>   #include <linux/list_sort.h>
>   #include <linux/namei.h>
> +#ifdef CONFIG_BTRFS_DEBUG
> +#include <linux/part_stat.h>
> +#endif
>   #include "misc.h"
>   #include "ctree.h"
>   #include "disk-io.h"
> @@ -5860,6 +5863,39 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
>   }
>
>   #ifdef CONFIG_BTRFS_DEBUG
> +static int btrfs_best_stripe(struct btrfs_fs_info *fs_info,
> +			     struct btrfs_chunk_map *map, int first,
> +			     int num_stripe)
> +{
> +	u64 est_wait = 0;

Is this a typo of best_wait? Or do you mean estimated wait?

> +	int best_stripe = 0;
> +	int index;
> +
> +	for (index = first; index < first + num_stripe; index++) {
> +		u64 read_wait;
> +		u64 avg_wait = 0;
> +		unsigned long read_ios;
> +		struct btrfs_device *device = map->stripes[index].dev;
> +
> +		read_wait = part_stat_read(device->bdev, nsecs[READ]);
> +		read_ios = part_stat_read(device->bdev, ios[READ]);
> +
> +		if (read_wait && read_ios && read_wait >= read_ios)
> +			avg_wait = div_u64(read_wait, read_ios);
> +		else
> +			btrfs_debug_rl(fs_info,
> +			"devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
> +				       device->devid, read_wait, read_ios);

I do not think we need this debug messages.

The device can have no read so far.

> +
> +		if (est_wait == 0 || est_wait > avg_wait) {

You can give @est_wait a default value of U64_MAX, so that you do not
need to check for 0, just est_wait > avg_wait is enough.

Thanks,
Qu

> +			est_wait = avg_wait;
> +			best_stripe = index;
> +		}
> +	}
> +
> +	return best_stripe;
> +}
> +
>   struct stripe_mirror {
>   	u64 devid;
>   	int map;
> @@ -5940,6 +5976,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
>   	case BTRFS_READ_POLICY_ROTATION:
>   		preferred_mirror = btrfs_read_rotation(map, first, num_stripes);
>   		break;
> +	case BTRFS_READ_POLICY_LATENCY:
> +		preferred_mirror = btrfs_best_stripe(fs_info, map, first,
> +								num_stripes);
> +		break;
>   #endif
>   	}
>
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 81701217dbb9..09920ef76a9b 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -306,6 +306,8 @@ enum btrfs_read_policy {
>   #ifdef CONFIG_BTRFS_DEBUG
>   	/* Balancing raid1 reads across all striped devices */
>   	BTRFS_READ_POLICY_ROTATION,
> +	/* Use the lowest-latency device dynamically */
> +	BTRFS_READ_POLICY_LATENCY,
>   #endif
>   	BTRFS_NR_READ_POLICY,
>   };
Anand Jain Oct. 11, 2024, 1:21 a.m. UTC | #2
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index c130a27386a7..20bc62d85b3b 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -12,6 +12,9 @@
>>   #include <linux/uuid.h>
>>   #include <linux/list_sort.h>
>>   #include <linux/namei.h>
>> +#ifdef CONFIG_BTRFS_DEBUG
>> +#include <linux/part_stat.h>
>> +#endif
>>   #include "misc.h"
>>   #include "ctree.h"
>>   #include "disk-io.h"
>> @@ -5860,6 +5863,39 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info 
>> *fs_info, u64 logical, u64 len)
>>   }
>>
>>   #ifdef CONFIG_BTRFS_DEBUG
>> +static int btrfs_best_stripe(struct btrfs_fs_info *fs_info,
>> +                 struct btrfs_chunk_map *map, int first,
>> +                 int num_stripe)
>> +{
>> +    u64 est_wait = 0;
> 
> Is this a typo of best_wait? Or do you mean estimated wait?
> 

It is best_wait. Fixed in v2.


>> +    int best_stripe = 0;
>> +    int index;
>> +
>> +    for (index = first; index < first + num_stripe; index++) {
>> +        u64 read_wait;
>> +        u64 avg_wait = 0;
>> +        unsigned long read_ios;
>> +        struct btrfs_device *device = map->stripes[index].dev;
>> +
>> +        read_wait = part_stat_read(device->bdev, nsecs[READ]);
>> +        read_ios = part_stat_read(device->bdev, ios[READ]);
>> +
>> +        if (read_wait && read_ios && read_wait >= read_ios)
>> +            avg_wait = div_u64(read_wait, read_ios);
>> +        else
>> +            btrfs_debug_rl(fs_info,
>> +            "devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
>> +                       device->devid, read_wait, read_ios);
> 
> I do not think we need this debug messages.
> 
> The device can have no read so far.
> 

Um. Yeah, we can remove it.

>> +
>> +        if (est_wait == 0 || est_wait > avg_wait) {
> 
> You can give @est_wait a default value of U64_MAX, so that you do not
> need to check for 0, just est_wait > avg_wait is enough.
> 

Fixed in v2.

Thanks for the review.

- Anand


> Thanks,
> Qu
diff mbox series

Patch

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 18fb35a887c6..15abf931726c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1306,7 +1306,7 @@  static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
 BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
 
 #ifdef CONFIG_BTRFS_DEBUG
-static const char * const btrfs_read_policy_name[] = { "pid", "rotation" };
+static const char * const btrfs_read_policy_name[] = { "pid", "rotation", "latency" };
 #else
 static const char * const btrfs_read_policy_name[] = { "pid" };
 #endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c130a27386a7..20bc62d85b3b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -12,6 +12,9 @@ 
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
 #include <linux/namei.h>
+#ifdef CONFIG_BTRFS_DEBUG
+#include <linux/part_stat.h>
+#endif
 #include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -5860,6 +5863,39 @@  int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 }
 
 #ifdef CONFIG_BTRFS_DEBUG
+static int btrfs_best_stripe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_chunk_map *map, int first,
+			     int num_stripe)
+{
+	u64 est_wait = 0;
+	int best_stripe = 0;
+	int index;
+
+	for (index = first; index < first + num_stripe; index++) {
+		u64 read_wait;
+		u64 avg_wait = 0;
+		unsigned long read_ios;
+		struct btrfs_device *device = map->stripes[index].dev;
+
+		read_wait = part_stat_read(device->bdev, nsecs[READ]);
+		read_ios = part_stat_read(device->bdev, ios[READ]);
+
+		if (read_wait && read_ios && read_wait >= read_ios)
+			avg_wait = div_u64(read_wait, read_ios);
+		else
+			btrfs_debug_rl(fs_info,
+			"devid: %llu avg_wait ZERO read_wait %llu read_ios %lu",
+				       device->devid, read_wait, read_ios);
+
+		if (est_wait == 0 || est_wait > avg_wait) {
+			est_wait = avg_wait;
+			best_stripe = index;
+		}
+	}
+
+	return best_stripe;
+}
+
 struct stripe_mirror {
 	u64 devid;
 	int map;
@@ -5940,6 +5976,10 @@  static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	case BTRFS_READ_POLICY_ROTATION:
 		preferred_mirror = btrfs_read_rotation(map, first, num_stripes);
 		break;
+	case BTRFS_READ_POLICY_LATENCY:
+		preferred_mirror = btrfs_best_stripe(fs_info, map, first,
+								num_stripes);
+		break;
 #endif
 	}
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 81701217dbb9..09920ef76a9b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -306,6 +306,8 @@  enum btrfs_read_policy {
 #ifdef CONFIG_BTRFS_DEBUG
 	/* Balancing raid1 reads across all striped devices */
 	BTRFS_READ_POLICY_ROTATION,
+	/* Use the lowest-latency device dynamically */
+	BTRFS_READ_POLICY_LATENCY,
 #endif
 	BTRFS_NR_READ_POLICY,
 };