diff mbox series

[v4,5/5] md/raid10: Atomic write support

Message ID 20241112124256.4106435-6-john.g.garry@oracle.com (mailing list archive)
State Superseded
Headers show
Series RAID 0/1/10 atomic write support | expand

Checks

Context Check Description
mdraidci/vmtest-md-6_13-PR fail PR summary
mdraidci/vmtest-md-6_13-VM_Test-0 fail Logs for per-patch-testing

Commit Message

John Garry Nov. 12, 2024, 12:42 p.m. UTC
Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes.

For an attempt to atomic write to a region which has bad blocks, error
the write as we just cannot do this. It is unlikely to find devices which
support atomic writes and bad blocks.

Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 drivers/md/raid10.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

Comments

Song Liu Nov. 15, 2024, 6:19 p.m. UTC | #1
On Tue, Nov 12, 2024 at 4:43 AM John Garry <john.g.garry@oracle.com> wrote:
>
> Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes.
>
> For an attempt to atomic write to a region which has bad blocks, error
> the write as we just cannot do this. It is unlikely to find devices which
> support atomic writes and bad blocks.
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
>  drivers/md/raid10.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 8c7f5daa073a..a3936a67e1e8 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1255,6 +1255,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>         const enum req_op op = bio_op(bio);
>         const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
>         const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
> +       const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
>         unsigned long flags;
>         struct r10conf *conf = mddev->private;
>         struct md_rdev *rdev;
> @@ -1273,7 +1274,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>         mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
>                                    choose_data_offset(r10_bio, rdev));
>         mbio->bi_end_io = raid10_end_write_request;
> -       mbio->bi_opf = op | do_sync | do_fua;
> +       mbio->bi_opf = op | do_sync | do_fua | do_atomic;
>         if (!replacement && test_bit(FailFast,
>                                      &conf->mirrors[devnum].rdev->flags)
>                          && enough(conf, devnum))
> @@ -1468,7 +1469,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>                                 continue;
>                         }
>                         if (is_bad) {
> -                               int good_sectors = first_bad - dev_sector;
> +                               int good_sectors;
> +
> +                               if (bio->bi_opf & REQ_ATOMIC) {
> +                                       /* We just cannot atomically write this ... */
> +                                       error = -EFAULT;

Is EFAULT the right error code here? I think we should return something
covered by blk_errors?

Other than this, 4/5 and 5/5 look good to me.

Thanks,
Song
Yu Kuai Nov. 16, 2024, 3:50 a.m. UTC | #2
Hi,

在 2024/11/16 2:19, Song Liu 写道:
> On Tue, Nov 12, 2024 at 4:43 AM John Garry <john.g.garry@oracle.com> wrote:
>>
>> Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes.
>>
>> For an attempt to atomic write to a region which has bad blocks, error
>> the write as we just cannot do this. It is unlikely to find devices which
>> support atomic writes and bad blocks.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>>   drivers/md/raid10.c | 14 ++++++++++++--
>>   1 file changed, 12 insertions(+), 2 deletions(-)
>>

Reviewed-by: Yu Kuai <yukuai3@huawei.com>

One nit below:
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index 8c7f5daa073a..a3936a67e1e8 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1255,6 +1255,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>>          const enum req_op op = bio_op(bio);
>>          const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
>>          const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
>> +       const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
>>          unsigned long flags;
>>          struct r10conf *conf = mddev->private;
>>          struct md_rdev *rdev;
>> @@ -1273,7 +1274,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>>          mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
>>                                     choose_data_offset(r10_bio, rdev));
>>          mbio->bi_end_io = raid10_end_write_request;
>> -       mbio->bi_opf = op | do_sync | do_fua;
>> +       mbio->bi_opf = op | do_sync | do_fua | do_atomic;
>>          if (!replacement && test_bit(FailFast,
>>                                       &conf->mirrors[devnum].rdev->flags)
>>                           && enough(conf, devnum))
>> @@ -1468,7 +1469,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
>>                                  continue;
>>                          }
>>                          if (is_bad) {
>> -                               int good_sectors = first_bad - dev_sector;
>> +                               int good_sectors;
>> +
>> +                               if (bio->bi_opf & REQ_ATOMIC) {
>> +                                       /* We just cannot atomically write this ... */

Maybe mention that we can if there is at least one disk without any BB,
it's just benefit does not worth the complexity. And return the special
error code to let user retry without atomic write.

>> +                                       error = -EFAULT;
> 
> Is EFAULT the right error code here? I think we should return something
> covered by blk_errors?

The error code is passed to bio by:

bio->bi_status = errno_to_blk_status(error);

So, this is fine.
> 
> Other than this, 4/5 and 5/5 look good to me.
> 
> Thanks,
> Song
> 
> .
>
John Garry Nov. 17, 2024, 6:32 p.m. UTC | #3
On 16/11/2024 03:50, Yu Kuai wrote:
> Hi,
> 
> 在 2024/11/16 2:19, Song Liu 写道:
>> On Tue, Nov 12, 2024 at 4:43 AM John Garry <john.g.garry@oracle.com> 
>> wrote:
>>>
>>> Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes.
>>>
>>> For an attempt to atomic write to a region which has bad blocks, error
>>> the write as we just cannot do this. It is unlikely to find devices 
>>> which
>>> support atomic writes and bad blocks.
>>>
>>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>>> ---
>>>   drivers/md/raid10.c | 14 ++++++++++++--
>>>   1 file changed, 12 insertions(+), 2 deletions(-)
>>>
> 
> Reviewed-by: Yu Kuai <yukuai3@huawei.com>
> 
> One nit below:
>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>> index 8c7f5daa073a..a3936a67e1e8 100644
>>> --- a/drivers/md/raid10.c
>>> +++ b/drivers/md/raid10.c
>>> @@ -1255,6 +1255,7 @@ static void raid10_write_one_disk(struct mddev 
>>> *mddev, struct r10bio *r10_bio,
>>>          const enum req_op op = bio_op(bio);
>>>          const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
>>>          const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
>>> +       const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
>>>          unsigned long flags;
>>>          struct r10conf *conf = mddev->private;
>>>          struct md_rdev *rdev;
>>> @@ -1273,7 +1274,7 @@ static void raid10_write_one_disk(struct mddev 
>>> *mddev, struct r10bio *r10_bio,
>>>          mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
>>>                                     choose_data_offset(r10_bio, rdev));
>>>          mbio->bi_end_io = raid10_end_write_request;
>>> -       mbio->bi_opf = op | do_sync | do_fua;
>>> +       mbio->bi_opf = op | do_sync | do_fua | do_atomic;
>>>          if (!replacement && test_bit(FailFast,
>>>                                       &conf->mirrors[devnum].rdev- 
>>> >flags)
>>>                           && enough(conf, devnum))
>>> @@ -1468,7 +1469,15 @@ static void raid10_write_request(struct mddev 
>>> *mddev, struct bio *bio,
>>>                                  continue;
>>>                          }
>>>                          if (is_bad) {
>>> -                               int good_sectors = first_bad - 
>>> dev_sector;
>>> +                               int good_sectors;
>>> +
>>> +                               if (bio->bi_opf & REQ_ATOMIC) {
>>> +                                       /* We just cannot atomically 
>>> write this ... */
> 
> Maybe mention that we can if there is at least one disk without any BB,
> it's just benefit does not worth the complexity. And return the special
> error code to let user retry without atomic write.

ok

> 
>>> +                                       error = -EFAULT;
>>
>> Is EFAULT the right error code here? I think we should return something
>> covered by blk_errors?

sure, so maybe explicitly use BLK_STS_IOERR / EIO, which is what we 
generally use in raid drivers when we cannot read/write - ok?

> 
> The error code is passed to bio by:
> 
> bio->bi_status = errno_to_blk_status(error);
> 
> So, this is fine.
>>
>> Other than this, 4/5 and 5/5 look good to me.
>>

Thanks,
John
diff mbox series

Patch

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8c7f5daa073a..a3936a67e1e8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1255,6 +1255,7 @@  static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	const enum req_op op = bio_op(bio);
 	const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
 	const blk_opf_t do_fua = bio->bi_opf & REQ_FUA;
+	const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC;
 	unsigned long flags;
 	struct r10conf *conf = mddev->private;
 	struct md_rdev *rdev;
@@ -1273,7 +1274,7 @@  static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	mbio->bi_iter.bi_sector	= (r10_bio->devs[n_copy].addr +
 				   choose_data_offset(r10_bio, rdev));
 	mbio->bi_end_io	= raid10_end_write_request;
-	mbio->bi_opf = op | do_sync | do_fua;
+	mbio->bi_opf = op | do_sync | do_fua | do_atomic;
 	if (!replacement && test_bit(FailFast,
 				     &conf->mirrors[devnum].rdev->flags)
 			 && enough(conf, devnum))
@@ -1468,7 +1469,15 @@  static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				continue;
 			}
 			if (is_bad) {
-				int good_sectors = first_bad - dev_sector;
+				int good_sectors;
+
+				if (bio->bi_opf & REQ_ATOMIC) {
+					/* We just cannot atomically write this ... */
+					error = -EFAULT;
+					goto err_handle;
+				}
+
+				good_sectors = first_bad - dev_sector;
 				if (good_sectors < max_sectors)
 					max_sectors = good_sectors;
 			}
@@ -4025,6 +4034,7 @@  static int raid10_set_queue_limits(struct mddev *mddev)
 	lim.max_write_zeroes_sectors = 0;
 	lim.io_min = mddev->chunk_sectors << 9;
 	lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
+	lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED;
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err) {
 		queue_limits_cancel_update(mddev->gendisk->queue);