
[V3,5/7] ublk_drv: consider recovery feature in aborting mechanism

Message ID 20220913041707.197334-6-ZiyangZhang@linux.alibaba.com (mailing list archive)
State New, archived
Series ublk_drv: add USER_RECOVERY support

Commit Message

Ziyang Zhang Sept. 13, 2022, 4:17 a.m. UTC
With USER_RECOVERY feature enabled, the monitor_work schedules
quiesce_work after finding a dying ubq_daemon. The quiesce_work's job
is to:
(1) quiesce request queue.
(2) check if there is any INFLIGHT rq with UBLK_IO_FLAG_ACTIVE unset.
    If so, we retry until all these rqs are requeued by ublk_queue_rq()
    and task_work and become IDLE.
(3) requeue/abort inflight rqs issued to the crashed ubq_daemon before. If
    UBLK_F_USER_RECOVERY_REISSUE is set, the rq is requeued; otherwise it is
    aborted.
(4) complete all ioucmds by calling io_uring_cmd_done(). We are safe to
    do so because no ioucmd can be referenced now.
(5) set ub's state to UBLK_S_DEV_QUIESCED, which means we are ready for
    recovery. This state is exposed to userspace by GET_DEV_INFO.

The driver can always handle STOP_DEV and clean up everything no matter
whether ub's state is LIVE or QUIESCED. After ub's state becomes
UBLK_S_DEV_QUIESCED, the user can recover with a new process by sending
START_USER_RECOVERY.

Note: we do not change the default behavior with the recovery feature
disabled. monitor_work still schedules stop_work and aborts inflight
rqs. Finally the ublk_device is released.
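
For reference, the expected userspace flow implied above is roughly the
following (a sketch only; the command names are taken from elsewhere in this
series and the exact sequence may differ):

	/*
	 * 1. notice that the old ubq_daemons have died (process exited)
	 * 2. poll GET_DEV_INFO until ub's state is UBLK_S_DEV_QUIESCED
	 * 3. send START_USER_RECOVERY so the driver re-inits the queues
	 * 4. start new per-queue daemons; each re-issues FETCH_REQ for
	 *    every tag to take over the device
	 * 5. send END_USER_RECOVERY (introduced later in this series);
	 *    the driver unquiesces the request queue and the requeued
	 *    rqs are dispatched to the new daemons
	 */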

Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
---
 drivers/block/ublk_drv.c | 168 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 161 insertions(+), 7 deletions(-)

Comments

Ming Lei Sept. 19, 2022, 9:32 a.m. UTC | #1
On Tue, Sep 13, 2022 at 12:17:05PM +0800, ZiyangZhang wrote:
> With USER_RECOVERY feature enabled, the monitor_work schedules
> quiesce_work after finding a dying ubq_daemon. The quiesce_work's job
> is to:
> (1) quiesce request queue.
> (2) check if there is any INFLIGHT rq with UBLK_IO_FLAG_ACTIVE unset.
>     If so, we retry until all these rqs are requeued by ublk_queue_rq()
>     and task_work and become IDLE.

These requests should be being handled by task work or the io_uring
fallback wq; I suggest mentioning that in the commit log.
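
For reference, the path being referred to looks roughly like this (paraphrased
from patch 4 of this series and simplified; treat the names as illustrative):

	/* task work (or the io_uring fallback wq running it) */
	static void __ublk_rq_task_work(struct request *req)
	{
		struct ublk_queue *ubq = req->mq_hctx->driver_data;

		if (unlikely(current != ubq->ubq_daemon ||
			     current->flags & PF_EXITING)) {
			/*
			 * The daemon is exiting: with recovery enabled the rq
			 * is requeued (otherwise failed), which is exactly what
			 * ublk_wait_tagset_rqs_idle() later waits to finish.
			 */
			__ublk_abort_rq(ubq, req);
			return;
		}
		/* normal path: fill the iod and complete the FETCH ioucmd */
	}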

> (3) requeue/abort inflight rqs issued to the crash ubq_daemon before. If
>     UBLK_F_USER_RECOVERY_REISSUE is set, rq is requeued; or it is
>     aborted.
> (4) complete all ioucmds by calling io_uring_cmd_done(). We are safe to
>     do so because no ioucmd can be referenced now.
> (5) set ub's state to UBLK_S_DEV_QUIESCED, which means we are ready for
>     recovery. This state is exposed to userspace by GET_DEV_INFO.
> 
> The driver can always handle STOP_DEV and cleanup everything no matter
> ub's state is LIVE or QUIESCED. After ub's state is UBLK_S_DEV_QUIESCED,
> user can recover with new process by sending START_USER_RECOVERY.
> 
> Note: we do not change the default behavior with reocvery feature
> disabled. monitor_work still schedules stop_work and abort inflight
> rqs. Finally ublk_device is released.

This version looks much better than before.

> 
> Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
> ---
>  drivers/block/ublk_drv.c | 168 +++++++++++++++++++++++++++++++++++++--
>  1 file changed, 161 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index b067f33a1913..4409a130d0b6 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -121,7 +121,7 @@ struct ublk_queue {
>  
>  	unsigned long io_addr;	/* mapped vm address */
>  	unsigned int max_io_sz;
> -	bool abort_work_pending;
> +	bool force_abort;
>  	unsigned short nr_io_ready;	/* how many ios setup */
>  	struct ublk_device *dev;
>  	struct ublk_io ios[0];
> @@ -163,6 +163,7 @@ struct ublk_device {
>  	 * monitor each queue's daemon periodically
>  	 */
>  	struct delayed_work	monitor_work;
> +	struct work_struct	quiesce_work;
>  	struct work_struct	stop_work;
>  };
>  
> @@ -660,6 +661,11 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
>  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
>  
>  	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
> +		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
> +				__func__,
> +				((struct ublk_queue *)req->mq_hctx->driver_data)->q_id,

req->mq_hctx->queue_num is cleaner.
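
That is, the pr_devel() call would become something like (untested sketch):

	pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
			__func__, req->mq_hctx->queue_num, req->tag, io->flags);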

> +				req->tag,
> +				io->flags);
>  		io->flags |= UBLK_IO_FLAG_ABORTED;
>  		blk_mq_end_request(req, BLK_STS_IOERR);
>  	}
> @@ -820,6 +826,21 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	res = ublk_setup_iod(ubq, rq);
>  	if (unlikely(res != BLK_STS_OK))
>  		return BLK_STS_IOERR;
> +    /* With recovery feature enabled, force_abort is set in
> +     * ublk_stop_dev() before calling del_gendisk() if ub's state
> +     * is QUIESCED. We have to abort all requeued and new rqs here
> +     * to let del_gendisk() move on. Besides, we do not call
> +     * io_uring_cmd_complete_in_task() to avoid UAF on io_uring ctx.
> +     *
> +     * Note: force_abort is guaranteed to be seen because it is set
> +     * before request queue is unqiuesced.
> +     */
> +	if (unlikely(ubq->force_abort)) {
> +		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
> +				__func__, ubq->q_id, rq->tag,
> +				ubq->ios[rq->tag].flags);
> +		return BLK_STS_IOERR;
> +	}
>  
>  	blk_mq_start_request(bd->rq);
>  
> @@ -1003,6 +1024,101 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
>  	ublk_put_device(ub);
>  }
>  
> +static bool ublk_check_inflight_rq(struct request *rq, void *data)
> +{
> +	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
> +	struct ublk_io *io = &ubq->ios[rq->tag];
> +	bool *busy = data;
> +
> +	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
> +		*busy = true;
> +		return false;
> +	}
> +	return true;
> +}
> +
> +static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
> +{
> +	bool busy = false;
> +
> +	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
> +	while (true) {
> +		blk_mq_tagset_busy_iter(&ub->tag_set,
> +				ublk_check_inflight_rq, &busy);
> +		if (busy)
> +			msleep(UBLK_REQUEUE_DELAY_MS);
> +		else
> +			break;
> +	}
> +}
> +
> +static void ublk_quiesce_queue(struct ublk_device *ub,
> +		struct ublk_queue *ubq)
> +{
> +	int i;
> +
> +	for (i = 0; i < ubq->q_depth; i++) {
> +		struct ublk_io *io = &ubq->ios[i];
> +
> +		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
> +			struct request *rq = blk_mq_tag_to_rq(
> +					ub->tag_set.tags[ubq->q_id], i);
> +
> +			WARN_ON_ONCE(!rq);
> +			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
> +					ublk_queue_can_use_recovery_reissue(ubq) ?
> +					"requeue" : "abort",
> +					ubq->q_id, i, io->flags);
> +			if (ublk_queue_can_use_recovery_reissue(ubq))
> +				blk_mq_requeue_request(rq, false);

This way is too violent.

There may be just one queue dying, but you requeue all requests
from every queue. I'd suggest taking the approach in ublk_daemon_monitor_work(),
such as requeuing only the requests in the dying queue.

That said, you can still re-use the logic in ublk_abort_queue()/ublk_daemon_monitor_work()
for making forward progress, just replacing the abort of the request with a
requeue in ublk_abort_queue().

> +			else
> +				__ublk_fail_req(io, rq);
> +		} else {
> +			pr_devel("%s: done old cmd: qid %d tag %d\n",
> +					__func__, ubq->q_id, i);
> +			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
> +			io->flags &= ~UBLK_IO_FLAG_ACTIVE;
> +		}
> +		ubq->nr_io_ready--;
> +	}
> +	WARN_ON_ONCE(ubq->nr_io_ready);
> +}
> +
> +static void ublk_quiesce_dev(struct ublk_device *ub)
> +{
> +	int i;
> +
> +	mutex_lock(&ub->mutex);
> +	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
> +		goto unlock;
> +
> +	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
> +		struct ublk_queue *ubq = ublk_get_queue(ub, i);
> +
> +		if (!ubq_daemon_is_dying(ubq))
> +			goto unlock;
> +	}
> +	blk_mq_quiesce_queue(ub->ub_disk->queue);
> +	ublk_wait_tagset_rqs_idle(ub);
> +	pr_devel("%s: quiesce ub: dev_id %d\n",
> +			__func__, ub->dev_info.dev_id);
> +
> +	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
> +		ublk_quiesce_queue(ub, ublk_get_queue(ub, i));
> +
> +	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
> + unlock:
> +	mutex_unlock(&ub->mutex);
> +}
> +
> +static void ublk_quiesce_work_fn(struct work_struct *work)
> +{
> +	struct ublk_device *ub =
> +		container_of(work, struct ublk_device, quiesce_work);
> +
> +	ublk_quiesce_dev(ub);
> +}
> +
>  static void ublk_daemon_monitor_work(struct work_struct *work)
>  {
>  	struct ublk_device *ub =
> @@ -1013,10 +1129,14 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
>  		struct ublk_queue *ubq = ublk_get_queue(ub, i);
>  
>  		if (ubq_daemon_is_dying(ubq)) {
> -			schedule_work(&ub->stop_work);
> -
> -			/* abort queue is for making forward progress */
> -			ublk_abort_queue(ub, ubq);
> +			if (ublk_queue_can_use_recovery(ubq)) {
> +				schedule_work(&ub->quiesce_work);
> +			} else {
> +				schedule_work(&ub->stop_work);
> +
> +				/* abort queue is for making forward progress */
> +				ublk_abort_queue(ub, ubq);
> +			}

If the quiesce work is always scheduled exclusively with the stop work,
the two can be defined as a union, but that is not a big deal.
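
I.e., something like the following sketch, assuming the two works are never
queued at the same time:

	struct ublk_device {
		...
		struct delayed_work	monitor_work;
		union {
			struct work_struct	quiesce_work;
			struct work_struct	stop_work;
		};
	};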


Thanks, 
Ming
Ziyang Zhang Sept. 19, 2022, 9:55 a.m. UTC | #2
On 2022/9/19 17:32, Ming Lei wrote:
> On Tue, Sep 13, 2022 at 12:17:05PM +0800, ZiyangZhang wrote:
>> With USER_RECOVERY feature enabled, the monitor_work schedules
>> quiesce_work after finding a dying ubq_daemon. The quiesce_work's job
>> is to:
>> (1) quiesce request queue.
>> (2) check if there is any INFLIGHT rq with UBLK_IO_FLAG_ACTIVE unset.
>>     If so, we retry until all these rqs are requeued by ublk_queue_rq()
>>     and task_work and become IDLE.
> 
> These requests should be being handled by task work or the io_uring
> fallback wq, and suggest to add the words here.

Will do so.

> 
>> (3) requeue/abort inflight rqs issued to the crash ubq_daemon before. If
>>     UBLK_F_USER_RECOVERY_REISSUE is set, rq is requeued; or it is
>>     aborted.
>> (4) complete all ioucmds by calling io_uring_cmd_done(). We are safe to
>>     do so because no ioucmd can be referenced now.
>> (5) set ub's state to UBLK_S_DEV_QUIESCED, which means we are ready for
>>     recovery. This state is exposed to userspace by GET_DEV_INFO.
>>
>> The driver can always handle STOP_DEV and cleanup everything no matter
>> ub's state is LIVE or QUIESCED. After ub's state is UBLK_S_DEV_QUIESCED,
>> user can recover with new process by sending START_USER_RECOVERY.
>>
>> Note: we do not change the default behavior with reocvery feature
>> disabled. monitor_work still schedules stop_work and abort inflight
>> rqs. Finally ublk_device is released.
> 
> This version looks much better than before.

Thanks :)

> 
>>
>> Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
>> ---
>>  drivers/block/ublk_drv.c | 168 +++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 161 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>> index b067f33a1913..4409a130d0b6 100644
>> --- a/drivers/block/ublk_drv.c
>> +++ b/drivers/block/ublk_drv.c
>> @@ -121,7 +121,7 @@ struct ublk_queue {
>>  
>>  	unsigned long io_addr;	/* mapped vm address */
>>  	unsigned int max_io_sz;
>> -	bool abort_work_pending;
>> +	bool force_abort;
>>  	unsigned short nr_io_ready;	/* how many ios setup */
>>  	struct ublk_device *dev;
>>  	struct ublk_io ios[0];
>> @@ -163,6 +163,7 @@ struct ublk_device {
>>  	 * monitor each queue's daemon periodically
>>  	 */
>>  	struct delayed_work	monitor_work;
>> +	struct work_struct	quiesce_work;
>>  	struct work_struct	stop_work;
>>  };
>>  
>> @@ -660,6 +661,11 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
>>  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
>>  
>>  	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
>> +		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
>> +				__func__,
>> +				((struct ublk_queue *)req->mq_hctx->driver_data)->q_id,
> 
> req->mq_hctx->queue_num is cleaner.

Ok.

> 
>> +				req->tag,
>> +				io->flags);
>>  		io->flags |= UBLK_IO_FLAG_ABORTED;
>>  		blk_mq_end_request(req, BLK_STS_IOERR);
>>  	}
>> @@ -820,6 +826,21 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  	res = ublk_setup_iod(ubq, rq);
>>  	if (unlikely(res != BLK_STS_OK))
>>  		return BLK_STS_IOERR;
>> +    /* With recovery feature enabled, force_abort is set in
>> +     * ublk_stop_dev() before calling del_gendisk() if ub's state
>> +     * is QUIESCED. We have to abort all requeued and new rqs here
>> +     * to let del_gendisk() move on. Besides, we do not call
>> +     * io_uring_cmd_complete_in_task() to avoid UAF on io_uring ctx.
>> +     *
>> +     * Note: force_abort is guaranteed to be seen because it is set
>> +     * before request queue is unqiuesced.
>> +     */
>> +	if (unlikely(ubq->force_abort)) {
>> +		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
>> +				__func__, ubq->q_id, rq->tag,
>> +				ubq->ios[rq->tag].flags);
>> +		return BLK_STS_IOERR;
>> +	}
>>  
>>  	blk_mq_start_request(bd->rq);
>>  
>> @@ -1003,6 +1024,101 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
>>  	ublk_put_device(ub);
>>  }
>>  
>> +static bool ublk_check_inflight_rq(struct request *rq, void *data)
>> +{
>> +	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
>> +	struct ublk_io *io = &ubq->ios[rq->tag];
>> +	bool *busy = data;
>> +
>> +	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
>> +		*busy = true;
>> +		return false;
>> +	}
>> +	return true;
>> +}
>> +
>> +static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
>> +{
>> +	bool busy = false;
>> +
>> +	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
>> +	while (true) {
>> +		blk_mq_tagset_busy_iter(&ub->tag_set,
>> +				ublk_check_inflight_rq, &busy);
>> +		if (busy)
>> +			msleep(UBLK_REQUEUE_DELAY_MS);
>> +		else
>> +			break;
>> +	}
>> +}
>> +
>> +static void ublk_quiesce_queue(struct ublk_device *ub,
>> +		struct ublk_queue *ubq)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < ubq->q_depth; i++) {
>> +		struct ublk_io *io = &ubq->ios[i];
>> +
>> +		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
>> +			struct request *rq = blk_mq_tag_to_rq(
>> +					ub->tag_set.tags[ubq->q_id], i);
>> +
>> +			WARN_ON_ONCE(!rq);
>> +			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
>> +					ublk_queue_can_use_recovery_reissue(ubq) ?
>> +					"requeue" : "abort",
>> +					ubq->q_id, i, io->flags);
>> +			if (ublk_queue_can_use_recovery_reissue(ubq))
>> +				blk_mq_requeue_request(rq, false);
> 
> This way is too violent.
> 
> There may be just one queue dying, but you requeue all requests
> from any queue. I'd suggest to take the approach in ublk_daemon_monitor_work(),
> such as, just requeuing requests in dying queue.

If we want to start a new process after a crash for USER_RECOVERY, all old ubq_daemons
must exit and rqs of all queues have to be requeued/aborted. We cannot let live
ubq_daemons run any more because they do not belong to the new process.

BTW, I really wonder why there could be just one queue dying? All queues must be dying
shortly after any ubq_daemon is dying since they are all pthreads in the same process.

> 
> That said you still can re-use the logic in ublk_abort_queue()/ublk_daemon_monitor_work()
> for making progress, just changing aborting request with requeue in
> ublk_abort_queue().

I get your point, but it may be hard to reuse the logic in ublk_daemon_monitor_work()
because:
(1) we have to quiesce request queue in ublk_quiesce_dev(). This has to be done with
    ub_mutex.
(2) ublk_quiesce_dev() cannot be run after rqs are requeued/aborted.

> 
>> +			else
>> +				__ublk_fail_req(io, rq);
>> +		} else {
>> +			pr_devel("%s: done old cmd: qid %d tag %d\n",
>> +					__func__, ubq->q_id, i);
>> +			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
>> +			io->flags &= ~UBLK_IO_FLAG_ACTIVE;
>> +		}
>> +		ubq->nr_io_ready--;
>> +	}
>> +	WARN_ON_ONCE(ubq->nr_io_ready);
>> +}
>> +
>> +static void ublk_quiesce_dev(struct ublk_device *ub)
>> +{
>> +	int i;
>> +
>> +	mutex_lock(&ub->mutex);
>> +	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
>> +		goto unlock;
>> +
>> +	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
>> +		struct ublk_queue *ubq = ublk_get_queue(ub, i);
>> +
>> +		if (!ubq_daemon_is_dying(ubq))
>> +			goto unlock;
>> +	}
>> +	blk_mq_quiesce_queue(ub->ub_disk->queue);
>> +	ublk_wait_tagset_rqs_idle(ub);
>> +	pr_devel("%s: quiesce ub: dev_id %d\n",
>> +			__func__, ub->dev_info.dev_id);
>> +
>> +	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
>> +		ublk_quiesce_queue(ub, ublk_get_queue(ub, i));
>> +
>> +	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
>> + unlock:
>> +	mutex_unlock(&ub->mutex);
>> +}
>> +
>> +static void ublk_quiesce_work_fn(struct work_struct *work)
>> +{
>> +	struct ublk_device *ub =
>> +		container_of(work, struct ublk_device, quiesce_work);
>> +
>> +	ublk_quiesce_dev(ub);
>> +}
>> +
>>  static void ublk_daemon_monitor_work(struct work_struct *work)
>>  {
>>  	struct ublk_device *ub =
>> @@ -1013,10 +1129,14 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
>>  		struct ublk_queue *ubq = ublk_get_queue(ub, i);
>>  
>>  		if (ubq_daemon_is_dying(ubq)) {
>> -			schedule_work(&ub->stop_work);
>> -
>> -			/* abort queue is for making forward progress */
>> -			ublk_abort_queue(ub, ubq);
>> +			if (ublk_queue_can_use_recovery(ubq)) {
>> +				schedule_work(&ub->quiesce_work);
>> +			} else {
>> +				schedule_work(&ub->stop_work);
>> +
>> +				/* abort queue is for making forward progress */
>> +				ublk_abort_queue(ub, ubq);
>> +			}
> 
> If quiesce work are always scheduled exclusively with stop work,
> the two can be defined as union, but not one big deal.

OK.

Regards,
Zhang
Ming Lei Sept. 19, 2022, 12:33 p.m. UTC | #3
On Mon, Sep 19, 2022 at 05:55:05PM +0800, Ziyang Zhang wrote:
> On 2022/9/19 17:32, Ming Lei wrote:
> > On Tue, Sep 13, 2022 at 12:17:05PM +0800, ZiyangZhang wrote:
> >> With USER_RECOVERY feature enabled, the monitor_work schedules
> >> quiesce_work after finding a dying ubq_daemon. The quiesce_work's job
> >> is to:
> >> (1) quiesce request queue.
> >> (2) check if there is any INFLIGHT rq with UBLK_IO_FLAG_ACTIVE unset.
> >>     If so, we retry until all these rqs are requeued by ublk_queue_rq()
> >>     and task_work and become IDLE.
> > 
> > These requests should be being handled by task work or the io_uring
> > fallback wq, and suggest to add the words here.
> 
> Will do so.
> 
> > 
> >> (3) requeue/abort inflight rqs issued to the crash ubq_daemon before. If
> >>     UBLK_F_USER_RECOVERY_REISSUE is set, rq is requeued; or it is
> >>     aborted.
> >> (4) complete all ioucmds by calling io_uring_cmd_done(). We are safe to
> >>     do so because no ioucmd can be referenced now.
> >> (5) set ub's state to UBLK_S_DEV_QUIESCED, which means we are ready for
> >>     recovery. This state is exposed to userspace by GET_DEV_INFO.
> >>
> >> The driver can always handle STOP_DEV and cleanup everything no matter
> >> ub's state is LIVE or QUIESCED. After ub's state is UBLK_S_DEV_QUIESCED,
> >> user can recover with new process by sending START_USER_RECOVERY.
> >>
> >> Note: we do not change the default behavior with reocvery feature
> >> disabled. monitor_work still schedules stop_work and abort inflight
> >> rqs. Finally ublk_device is released.
> > 
> > This version looks much better than before.
> 
> Thanks :)
> 
> > 
> >>
> >> Signed-off-by: ZiyangZhang <ZiyangZhang@linux.alibaba.com>
> >> ---
> >>  drivers/block/ublk_drv.c | 168 +++++++++++++++++++++++++++++++++++++--
> >>  1 file changed, 161 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> >> index b067f33a1913..4409a130d0b6 100644
> >> --- a/drivers/block/ublk_drv.c
> >> +++ b/drivers/block/ublk_drv.c
> >> @@ -121,7 +121,7 @@ struct ublk_queue {
> >>  
> >>  	unsigned long io_addr;	/* mapped vm address */
> >>  	unsigned int max_io_sz;
> >> -	bool abort_work_pending;
> >> +	bool force_abort;
> >>  	unsigned short nr_io_ready;	/* how many ios setup */
> >>  	struct ublk_device *dev;
> >>  	struct ublk_io ios[0];
> >> @@ -163,6 +163,7 @@ struct ublk_device {
> >>  	 * monitor each queue's daemon periodically
> >>  	 */
> >>  	struct delayed_work	monitor_work;
> >> +	struct work_struct	quiesce_work;
> >>  	struct work_struct	stop_work;
> >>  };
> >>  
> >> @@ -660,6 +661,11 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
> >>  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
> >>  
> >>  	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
> >> +		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
> >> +				__func__,
> >> +				((struct ublk_queue *)req->mq_hctx->driver_data)->q_id,
> > 
> > req->mq_hctx->queue_num is cleaner.
> 
> Ok.
> 
> > 
> >> +				req->tag,
> >> +				io->flags);
> >>  		io->flags |= UBLK_IO_FLAG_ABORTED;
> >>  		blk_mq_end_request(req, BLK_STS_IOERR);
> >>  	}
> >> @@ -820,6 +826,21 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>  	res = ublk_setup_iod(ubq, rq);
> >>  	if (unlikely(res != BLK_STS_OK))
> >>  		return BLK_STS_IOERR;
> >> +    /* With recovery feature enabled, force_abort is set in
> >> +     * ublk_stop_dev() before calling del_gendisk() if ub's state
> >> +     * is QUIESCED. We have to abort all requeued and new rqs here
> >> +     * to let del_gendisk() move on. Besides, we do not call
> >> +     * io_uring_cmd_complete_in_task() to avoid UAF on io_uring ctx.
> >> +     *
> >> +     * Note: force_abort is guaranteed to be seen because it is set
> >> +     * before request queue is unqiuesced.
> >> +     */
> >> +	if (unlikely(ubq->force_abort)) {
> >> +		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
> >> +				__func__, ubq->q_id, rq->tag,
> >> +				ubq->ios[rq->tag].flags);
> >> +		return BLK_STS_IOERR;
> >> +	}
> >>  
> >>  	blk_mq_start_request(bd->rq);
> >>  
> >> @@ -1003,6 +1024,101 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
> >>  	ublk_put_device(ub);
> >>  }
> >>  
> >> +static bool ublk_check_inflight_rq(struct request *rq, void *data)
> >> +{
> >> +	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
> >> +	struct ublk_io *io = &ubq->ios[rq->tag];
> >> +	bool *busy = data;
> >> +
> >> +	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
> >> +		*busy = true;
> >> +		return false;
> >> +	}
> >> +	return true;
> >> +}
> >> +
> >> +static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
> >> +{
> >> +	bool busy = false;
> >> +
> >> +	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
> >> +	while (true) {
> >> +		blk_mq_tagset_busy_iter(&ub->tag_set,
> >> +				ublk_check_inflight_rq, &busy);
> >> +		if (busy)
> >> +			msleep(UBLK_REQUEUE_DELAY_MS);
> >> +		else
> >> +			break;
> >> +	}
> >> +}
> >> +
> >> +static void ublk_quiesce_queue(struct ublk_device *ub,
> >> +		struct ublk_queue *ubq)
> >> +{
> >> +	int i;
> >> +
> >> +	for (i = 0; i < ubq->q_depth; i++) {
> >> +		struct ublk_io *io = &ubq->ios[i];
> >> +
> >> +		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
> >> +			struct request *rq = blk_mq_tag_to_rq(
> >> +					ub->tag_set.tags[ubq->q_id], i);
> >> +
> >> +			WARN_ON_ONCE(!rq);
> >> +			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
> >> +					ublk_queue_can_use_recovery_reissue(ubq) ?
> >> +					"requeue" : "abort",
> >> +					ubq->q_id, i, io->flags);
> >> +			if (ublk_queue_can_use_recovery_reissue(ubq))
> >> +				blk_mq_requeue_request(rq, false);
> > 
> > This way is too violent.
> > 
> > There may be just one queue dying, but you requeue all requests
> > from any queue. I'd suggest to take the approach in ublk_daemon_monitor_work(),
> > such as, just requeuing requests in dying queue.
> 
> If we want to start a new process after a crash for USER_RECOVERY, all old ubq_daemons
> must exit and rqs of all queues have to be requeued/aborted. We cannot let live
> ubq_daemons run any more because they do not belong to the new process.

IMO, the old process really can still exist, and recently I even got such a
requirement for switching a queue from one thread to another.

What we should do is to get all inflight requests done, and cancel all io
commands, no matter if the ubq pthread is dead or live.

> 
> BTW, I really wonder why there could be just one queue dying? All queues must be dying
> shortly after any ubq_daemon is dying since they are all pthreads in the same process.

You can't assume it is always so. Maybe one pthread is dead first, and
others are dying later, maybe just one is dead.

If one queue's pthread is live, you may get trouble by simply requeuing
the request, that is why I suggest to re-use the logic of
ublk_daemon_monitor_work/ublk_abort_queue().

For stopping the device, the request queue is frozen in del_gendisk() and all
in-flight requests are drained, and the monitor work provides such a
guarantee.

For user recovery, monitor work should help you too by aborting one
queue if it is dying until all requests are drained.

> 
> > 
> > That said you still can re-use the logic in ublk_abort_queue()/ublk_daemon_monitor_work()
> > for making progress, just changing aborting request with requeue in
> > ublk_abort_queue().
> 
> I get your point, but it may be hard to reuse the logic in ublk_daemon_monitor_work()
> because:
> (1) we have to quiesce request queue in ublk_quiesce_dev(). This has to be done with
>     ub_mutex.
> (2) ublk_quiesce_dev() cannot be run after rqs are requeued/aborted.

I don't get your point. The request queue needs to be quiesced once; then
inflight requests are either requeued if the queue is dying, or completed by
the queue's pthread if it is live. As you mentioned, in reality, most times
all pthreads will be killed, but the timing can be different, and I think
you cannot requeue one request if the ubq pthread isn't dying.

 

Thanks, 
Ming
Ziyang Zhang Sept. 20, 2022, 1:49 a.m. UTC | #4
On 2022/9/19 20:33, Ming Lei wrote:
>>>> +
>>>> +static void ublk_quiesce_queue(struct ublk_device *ub,
>>>> +		struct ublk_queue *ubq)
>>>> +{
>>>> +	int i;
>>>> +
>>>> +	for (i = 0; i < ubq->q_depth; i++) {
>>>> +		struct ublk_io *io = &ubq->ios[i];
>>>> +
>>>> +		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
>>>> +			struct request *rq = blk_mq_tag_to_rq(
>>>> +					ub->tag_set.tags[ubq->q_id], i);
>>>> +
>>>> +			WARN_ON_ONCE(!rq);
>>>> +			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
>>>> +					ublk_queue_can_use_recovery_reissue(ubq) ?
>>>> +					"requeue" : "abort",
>>>> +					ubq->q_id, i, io->flags);
>>>> +			if (ublk_queue_can_use_recovery_reissue(ubq))
>>>> +				blk_mq_requeue_request(rq, false);
>>>
>>> This way is too violent.
>>>
>>> There may be just one queue dying, but you requeue all requests
>>> from any queue. I'd suggest to take the approach in ublk_daemon_monitor_work(),
>>> such as, just requeuing requests in dying queue.
>>
>> If we want to start a new process after a crash for USER_RECOVERY, all old ubq_daemons
>> must exit and rqs of all queues have to be requeued/aborted. We cannot let live
>> ubq_daemons run any more because they do not belong to the new process.
> 
> IMO, the old process really can exist, and recently even I got such
> requirement for switching queue from one thread to another.

For now, only one process can open /dev/ublkcX, so a new process is necessary.

If you think "per ubq_daemon" recovery is reasonable, I can do that in the future
if multiple processes is supported. But I really suggest that we can keep current
design as the first step which assumes all ubq_daemons are exited and a new process
is started, and that really meets our requirement.

BTW, START_USER_RECOVERY has to be reconsidered because we may need to pass a ubq_id
with it.

> 
> What we should do is to get all inflight requests done, and cancel all io
> commands, no matter if the ubq pthread is dead or live.
> 
>>
>> BTW, I really wonder why there could be just one queue dying? All queues must be dying
>> shortly after any ubq_daemon is dying since they are all pthreads in the same process.
> 
> You can't assume it is always so. Maybe one pthread is dead first, and
> others are dying later, maybe just one is dead.

Yes, I know there may be only one pthread dead while others keep running, but for now
ublk_drv only supports one process opening the same /dev/ublkcX, so the other pthreads
must die later (no matter whether they are killed by a signal or exit by themselves).

> 
> If one queue's pthread is live, you may get trouble by simply requeuing
> the request, that is why I suggest to re-use the logic of
> ublk_daemon_monitor_work/ublk_abort_queue().

Actually, if any ubq_daemon is live, no rqs are requeued; please see the check in
ublk_quiesce_dev(). It always makes sure that ALL ubq_daemons are dying before it
starts the quiesce jobs.

> 
> For stopping device, request queue is frozen in del_gendisk() and all
> in-flight requests are drained, and monitor work provides such
> guarantee.
> 
> For user recovery, monitor work should help you too by aborting one
> queue if it is dying until all requests are drained.

Monitor work can schedule quiesce_work if it finds a dying ubq_daemon.
Then quiesce_work calls ublk_quiesce_dev(). I do this because ublk_quiesce_dev()
has to wait for all inflight rqs with ACTIVE set to be requeued.

> 
>>
>>>
>>> That said you still can re-use the logic in ublk_abort_queue()/ublk_daemon_monitor_work()
>>> for making progress, just changing aborting request with requeue in
>>> ublk_abort_queue().
>>
>> I get your point, but it may be hard to reuse the logic in ublk_daemon_monitor_work()
>> because:
>> (1) we have to quiesce request queue in ublk_quiesce_dev(). This has to be done with
>>     ub_mutex.
>> (2) ublk_quiesce_dev() cannot be run after rqs are requeued/aborted.
> 
> I don't get your point, the request queue needs to be quiesced once, then
> either inflight requests are requeued if the queue is dying, or completed by
> the queue's pthread if it is live. As you mentioned, in reality, most times,
> all pthreads will be killed, but timing can be different, and I think
> you can not requeue one request if the ubq pthread isn't dying.

I do not requeue rqs with a live ubq_daemon. ublk_quiesce_dev() always starts
after all ubq_daemons are dying.

Regards,
Zhang.
Ming Lei Sept. 20, 2022, 3:04 a.m. UTC | #5
On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
> On 2022/9/19 20:33, Ming Lei wrote:
> >>>> +
> >>>> +static void ublk_quiesce_queue(struct ublk_device *ub,
> >>>> +		struct ublk_queue *ubq)
> >>>> +{
> >>>> +	int i;
> >>>> +
> >>>> +	for (i = 0; i < ubq->q_depth; i++) {
> >>>> +		struct ublk_io *io = &ubq->ios[i];
> >>>> +
> >>>> +		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
> >>>> +			struct request *rq = blk_mq_tag_to_rq(
> >>>> +					ub->tag_set.tags[ubq->q_id], i);
> >>>> +
> >>>> +			WARN_ON_ONCE(!rq);
> >>>> +			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
> >>>> +					ublk_queue_can_use_recovery_reissue(ubq) ?
> >>>> +					"requeue" : "abort",
> >>>> +					ubq->q_id, i, io->flags);
> >>>> +			if (ublk_queue_can_use_recovery_reissue(ubq))
> >>>> +				blk_mq_requeue_request(rq, false);
> >>>
> >>> This way is too violent.
> >>>
> >>> There may be just one queue dying, but you requeue all requests
> >>> from any queue. I'd suggest to take the approach in ublk_daemon_monitor_work(),
> >>> such as, just requeuing requests in dying queue.
> >>
> >> If we want to start a new process after a crash for USER_RECOVERY, all old ubq_daemons
> >> must exit and rqs of all queues have to be requeued/aborted. We cannot let live
> >> ubq_daemons run any more because they do not belong to the new process.
> > 
> > IMO, the old process really can exist, and recently even I got such
> > requirement for switching queue from one thread to another.
> 
> For now, only one process can open /dev/ublkcX, so a new process is necessary now.
> 
> If you think "per ubq_daemon" recovery is reasonable, I can do that in the future
> if multiple processes is supported. But I really suggest that we can keep current
> design as the first step which assumes all ubq_daemons are exited and a new process
> is started, and that really meets our requirement.
> 
> BTW, START_USER_RECOVERY has to be reconsidered because we may need to pass a ubq_id
> with it.
> 
> > 
> > What we should do is to get all inflight requests done, and cancel all io
> > commands, no matter if the ubq pthread is dead or live.
> > 
> >>
> >> BTW, I really wonder why there could be just one queue dying? All queues must be dying
> >> shortly after any ubq_daemon is dying since they are all pthreads in the same process.
> > 
> > You can't assume it is always so. Maybe one pthread is dead first, and
> > others are dying later, maybe just one is dead.
> 
> Yes, I know there may be only one pthread is dead while others keep running, but now
> ublk_drv only support one process opening the same /dev/ublkcX, so other pthreads
> must dead(no matter they are aborted by signal or themselves) later.
> 
> > 
> > If one queue's pthread is live, you may get trouble by simply requeuing
> > the request, that is why I suggest to re-use the logic of
> > ublk_daemon_monitor_work/ublk_abort_queue().
> 
> Actually, if any ubq_daemon is live, no rqs are requeued, please see the check in
> ublk_quiesce_dev(). It always makes sure that ALL ubq_daemons are dying, then it
> starts quiesce jobs.

OK, it looks like I missed this point, but you should quiesce the queue at the
beginning of ublk_quiesce_dev(), so the transition period can be kept
as short as possible. Otherwise, if one queue pthread isn't dying, the
device can be kept in this part-working state forever.

> 
> > 
> > For stopping device, request queue is frozen in del_gendisk() and all
> > in-flight requests are drained, and monitor work provides such
> > guarantee.
> > 
> > For user recovery, monitor work should help you too by aborting one
> > queue if it is dying until all requests are drained.
> 
> Monitor work can schedule quiesce_work if it finds a dying ubq_daemon.
> Then quiesce_work calls ublk_quiesce_dev(). I do this because ublk_quiesce_dev()
> has to wait all inflight rqs with ACTIVE set being requeued.
> 
> > 
> >>
> >>>
> >>> That said you still can re-use the logic in ublk_abort_queue()/ublk_daemon_monitor_work()
> >>> for making progress, just changing aborting request with requeue in
> >>> ublk_abort_queue().
> >>
> >> I get your point, but it may be hard to reuse the logic in ublk_daemon_monitor_work()
> >> because:
> >> (1) we have to quiesce request queue in ublk_quiesce_dev(). This has to be done with
> >>     ub_mutex.
> >> (2) ublk_quiesce_dev() cannot be run after rqs are requeued/aborted.

The following delta patch against patch 5 shows the idea:


diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 4409a130d0b6..60c5786c4711 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -656,7 +656,8 @@ static void ublk_complete_rq(struct request *req)
  * Also aborting may not be started yet, keep in mind that one failed
  * request may be issued by block layer again.
  */
-static void __ublk_fail_req(struct ublk_io *io, struct request *req)
+static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+		struct request *req)
 {
 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
 
@@ -667,7 +668,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
 				req->tag,
 				io->flags);
 		io->flags |= UBLK_IO_FLAG_ABORTED;
-		blk_mq_end_request(req, BLK_STS_IOERR);
+		if (ublk_queue_can_use_recovery_reissue(ubq))
+			blk_mq_requeue_request(req, false);
+		else
+			blk_mq_end_request(req, BLK_STS_IOERR);
 	}
 }
 
@@ -1018,7 +1022,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 			 */
 			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
 			if (rq)
-				__ublk_fail_req(io, rq);
+				__ublk_fail_req(ubq, io, rq);
 		}
 	}
 	ublk_put_device(ub);
@@ -1026,12 +1030,10 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 
 static bool ublk_check_inflight_rq(struct request *rq, void *data)
 {
-	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
-	struct ublk_io *io = &ubq->ios[rq->tag];
-	bool *busy = data;
+	bool *idle = data;
 
-	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
-		*busy = true;
+	if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
+		*idle = false;
 		return false;
 	}
 	return true;
@@ -1039,16 +1041,15 @@ static bool ublk_check_inflight_rq(struct request *rq, void *data)
 
 static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
 {
-	bool busy = false;
+	bool idle = true;
 
 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
 	while (true) {
 		blk_mq_tagset_busy_iter(&ub->tag_set,
-				ublk_check_inflight_rq, &busy);
-		if (busy)
-			msleep(UBLK_REQUEUE_DELAY_MS);
-		else
+				ublk_check_inflight_rq, &idle);
+		if (idle)
 			break;
+		msleep(UBLK_REQUEUE_DELAY_MS);
 	}
 }
 
@@ -1069,10 +1070,7 @@ static void ublk_quiesce_queue(struct ublk_device *ub,
 					ublk_queue_can_use_recovery_reissue(ubq) ?
 					"requeue" : "abort",
 					ubq->q_id, i, io->flags);
-			if (ublk_queue_can_use_recovery_reissue(ubq))
-				blk_mq_requeue_request(rq, false);
-			else
-				__ublk_fail_req(io, rq);
+			__ublk_fail_req(ubq, io, rq);
 		} else {
 			pr_devel("%s: done old cmd: qid %d tag %d\n",
 					__func__, ubq->q_id, i);
@@ -1092,12 +1090,6 @@ static void ublk_quiesce_dev(struct ublk_device *ub)
 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
 		goto unlock;
 
-	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
-		struct ublk_queue *ubq = ublk_get_queue(ub, i);
-
-		if (!ubq_daemon_is_dying(ubq))
-			goto unlock;
-	}
 	blk_mq_quiesce_queue(ub->ub_disk->queue);
 	ublk_wait_tagset_rqs_idle(ub);
 	pr_devel("%s: quiesce ub: dev_id %d\n",
@@ -1129,14 +1121,13 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
 
 		if (ubq_daemon_is_dying(ubq)) {
-			if (ublk_queue_can_use_recovery(ubq)) {
+			if (ublk_queue_can_use_recovery(ubq))
 				schedule_work(&ub->quiesce_work);
-			} else {
+			else
 				schedule_work(&ub->stop_work);
 
-				/* abort queue is for making forward progress */
-				ublk_abort_queue(ub, ubq);
-			}
+			/* abort queue is for making forward progress */
+			ublk_abort_queue(ub, ubq);
 		}
 	}
 




Thanks,
Ming
Ziyang Zhang Sept. 20, 2022, 3:24 a.m. UTC | #6
On 2022/9/20 11:04, Ming Lei wrote:
> On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
> 
> Follows the delta patch against patch 5 for showing the idea:
> 
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 4409a130d0b6..60c5786c4711 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -656,7 +656,8 @@ static void ublk_complete_rq(struct request *req)
>   * Also aborting may not be started yet, keep in mind that one failed
>   * request may be issued by block layer again.
>   */
> -static void __ublk_fail_req(struct ublk_io *io, struct request *req)
> +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
> +		struct request *req)
>  {
>  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
>  
> @@ -667,7 +668,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
>  				req->tag,
>  				io->flags);
>  		io->flags |= UBLK_IO_FLAG_ABORTED;
> -		blk_mq_end_request(req, BLK_STS_IOERR);
> +		if (ublk_queue_can_use_recovery_reissue(ubq))
> +			blk_mq_requeue_request(req, false);

Here is one problem:
We reset io->flags to 0 in ublk_queue_reinit() and it is called before the new
ubq_daemon with FETCH_REQ is accepted. ublk_abort_queue() is not protected by
ub_mutex and it is called many times from monitor_work. So the same rq may be
requeued multiple times.
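
A rough interleaving of the problem (illustrative only):

	/*
	 *   monitor_work:  ublk_abort_queue()
	 *                    io->flags |= UBLK_IO_FLAG_ABORTED
	 *                    blk_mq_requeue_request(rq, false)
	 *   recovery:      ublk_queue_reinit()
	 *                    io->flags = 0            <- ABORTED cleared
	 *   monitor_work:  ublk_abort_queue()         <- next period
	 *                    ABORTED no longer set
	 *                    blk_mq_requeue_request(rq, false)  <- same rq again
	 */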

With recovery disabled, there is no such problem since io->flags does not change
until ublk_dev is released.

In my patch 5 I only requeue the same rq once. So re-using ublk_abort_queue() is
hard for the recovery feature.
 
> +		else
> +			blk_mq_end_request(req, BLK_STS_IOERR);
>  	}
>  }
>  
> @@ -1018,7 +1022,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
>  			 */
>  			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
>  			if (rq)
> -				__ublk_fail_req(io, rq);
> +				__ublk_fail_req(ubq, io, rq);
>  		}
>  	}
>  	ublk_put_device(ub);
> @@ -1026,12 +1030,10 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
>  
>  static bool ublk_check_inflight_rq(struct request *rq, void *data)
>  {
> -	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
> -	struct ublk_io *io = &ubq->ios[rq->tag];
> -	bool *busy = data;
> +	bool *idle = data;
>  
> -	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
> -		*busy = true;
> +	if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
> +		*idle = false;
>  		return false;
>  	}
>  	return true;
> @@ -1039,16 +1041,15 @@ static bool ublk_check_inflight_rq(struct request *rq, void *data)
>  
>  static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
>  {
> -	bool busy = false;
> +	bool idle = true;
>  
>  	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
>  	while (true) {
>  		blk_mq_tagset_busy_iter(&ub->tag_set,
> -				ublk_check_inflight_rq, &busy);
> -		if (busy)
> -			msleep(UBLK_REQUEUE_DELAY_MS);
> -		else
> +				ublk_check_inflight_rq, &idle);
> +		if (idle)
>  			break;
> +		msleep(UBLK_REQUEUE_DELAY_MS);
>  	}
>  }
>  
> @@ -1069,10 +1070,7 @@ static void ublk_quiesce_queue(struct ublk_device *ub,
>  					ublk_queue_can_use_recovery_reissue(ubq) ?
>  					"requeue" : "abort",
>  					ubq->q_id, i, io->flags);
> -			if (ublk_queue_can_use_recovery_reissue(ubq))
> -				blk_mq_requeue_request(rq, false);
> -			else
> -				__ublk_fail_req(io, rq);
> +			__ublk_fail_req(ubq, io, rq);
>  		} else {
>  			pr_devel("%s: done old cmd: qid %d tag %d\n",
>  					__func__, ubq->q_id, i);
> @@ -1092,12 +1090,6 @@ static void ublk_quiesce_dev(struct ublk_device *ub)
>  	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
>  		goto unlock;
>  
> -	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
> -		struct ublk_queue *ubq = ublk_get_queue(ub, i);
> -
> -		if (!ubq_daemon_is_dying(ubq))
> -			goto unlock;
> -	}
>  	blk_mq_quiesce_queue(ub->ub_disk->queue);
>  	ublk_wait_tagset_rqs_idle(ub);
>  	pr_devel("%s: quiesce ub: dev_id %d\n",
> @@ -1129,14 +1121,13 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
>  		struct ublk_queue *ubq = ublk_get_queue(ub, i);
>  
>  		if (ubq_daemon_is_dying(ubq)) {
> -			if (ublk_queue_can_use_recovery(ubq)) {
> +			if (ublk_queue_can_use_recovery(ubq))
>  				schedule_work(&ub->quiesce_work);
> -			} else {
> +			else
>  				schedule_work(&ub->stop_work);
>  
> -				/* abort queue is for making forward progress */
> -				ublk_abort_queue(ub, ubq);
> -			}
> +			/* abort queue is for making forward progress */
> +			ublk_abort_queue(ub, ubq);
>  		}
>  	}
>  
> 
> 
> 
> 
> Thanks,
> Ming
Ming Lei Sept. 20, 2022, 4:01 a.m. UTC | #7
On Tue, Sep 20, 2022 at 11:24:12AM +0800, Ziyang Zhang wrote:
> On 2022/9/20 11:04, Ming Lei wrote:
> > On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
> > 
> > Follows the delta patch against patch 5 for showing the idea:
> > 
> > 
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index 4409a130d0b6..60c5786c4711 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -656,7 +656,8 @@ static void ublk_complete_rq(struct request *req)
> >   * Also aborting may not be started yet, keep in mind that one failed
> >   * request may be issued by block layer again.
> >   */
> > -static void __ublk_fail_req(struct ublk_io *io, struct request *req)
> > +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
> > +		struct request *req)
> >  {
> >  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
> >  
> > @@ -667,7 +668,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
> >  				req->tag,
> >  				io->flags);
> >  		io->flags |= UBLK_IO_FLAG_ABORTED;
> > -		blk_mq_end_request(req, BLK_STS_IOERR);
> > +		if (ublk_queue_can_use_recovery_reissue(ubq))
> > +			blk_mq_requeue_request(req, false);
> 
> Here is one problem:
> We reset io->flags to 0 in ublk_queue_reinit() and it is called before new

As we agreed, ublk_queue_reinit() will be moved to ublk_ch_release(), when there isn't
any inflight request; each request has been completed by either the ublk server or
__ublk_fail_req().

So clearing io->flags isn't related to quiescing the device.

> ubq_daemon with FETCH_REQ is accepted. ublk_abort_queue() is not protected with
> ub_mutex and it is called many times in monitor_work. So same rq may be requeued
> multiple times.

UBLK_IO_FLAG_ABORTED is set for the slot, so one req is only ended or
requeued just once.

> 
> With recovery disabled, there is no such problem since io->flags does not change
> until ublk_dev is released.

But we have agreed that ublk_queue_reinit() can be moved to release
handler of /dev/ublkcN.

> 
> In my patch 5 I only requeue the same rq once. So re-using ublk_abort_queue() is
> hard for recovery feature.

No, the same rq is just requeued once. Here the point is:

1) reuse the previous pattern in ublk_stop_dev(), which has proved to
work reliably

2) avoid staying in a half-working state forever

3) the idea behind it is simpler.


Thanks.
Ming
Ziyang Zhang Sept. 20, 2022, 4:39 a.m. UTC | #8
On 2022/9/20 12:01, Ming Lei wrote:
> On Tue, Sep 20, 2022 at 11:24:12AM +0800, Ziyang Zhang wrote:
>> On 2022/9/20 11:04, Ming Lei wrote:
>>> On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
>>>
>>> Follows the delta patch against patch 5 for showing the idea:
>>>
>>>
>>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>>> index 4409a130d0b6..60c5786c4711 100644
>>> --- a/drivers/block/ublk_drv.c
>>> +++ b/drivers/block/ublk_drv.c
>>> @@ -656,7 +656,8 @@ static void ublk_complete_rq(struct request *req)
>>>   * Also aborting may not be started yet, keep in mind that one failed
>>>   * request may be issued by block layer again.
>>>   */
>>> -static void __ublk_fail_req(struct ublk_io *io, struct request *req)
>>> +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
>>> +		struct request *req)
>>>  {
>>>  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
>>>  
>>> @@ -667,7 +668,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
>>>  				req->tag,
>>>  				io->flags);
>>>  		io->flags |= UBLK_IO_FLAG_ABORTED;
>>> -		blk_mq_end_request(req, BLK_STS_IOERR);
>>> +		if (ublk_queue_can_use_recovery_reissue(ubq))
>>> +			blk_mq_requeue_request(req, false);
>>
>> Here is one problem:
>> We reset io->flags to 0 in ublk_queue_reinit() and it is called before new
> 
> As we agreed, ublk_queue_reinit() will be moved to ublk_ch_release(), when there isn't
> any inflight request, which is completed by either ublk server or __ublk_fail_req().
> 
> So clearing io->flags isn't related with quisceing device.
> 
>> ubq_daemon with FETCH_REQ is accepted. ublk_abort_queue() is not protected with
>> ub_mutex and it is called many times in monitor_work. So same rq may be requeued
>> multiple times.
> 
> UBLK_IO_FLAG_ABORTED is set for the slot, so one req is only ended or
> requeued just once.

Yes, we can move ublk_queue_reinit() into ublk_ch_release(), but monitor_work is scheduled
periodically so ublk_abort_queue() is called multiple times. As ublk_queue_reinit() clears
io->flags, ublk_abort_queue() can requeue the same rq twice. Note that monitor_work can be
scheduled after ublk_ch_release().
 
> 
>>
>> With recovery disabled, there is no such problem since io->flags does not change
>> until ublk_dev is released.
> 
> But we have agreed that ublk_queue_reinit() can be moved to release
> handler of /dev/ublkcN.
> 
>>
>> In my patch 5 I only requeue the same rq once. So re-using ublk_abort_queue() is
>> hard for recovery feature.
> 
> No, the same rq is just requeued once. Here the point is:
> 
> 1) reuse previous pattern in ublk_stop_dev(), which is proved as
> workable reliably
> 
> 2) avoid to stay in half-working state forever
> 
> 3) the behind idea is more simpler.

Ming, your patch requeues rqs with ACTIVE unset; these rqs have been issued to the
dying ubq_daemon. What I am concerned about is inflight rqs with ACTIVE set.

Regards,
Zhang.
Ziyang Zhang Sept. 20, 2022, 4:45 a.m. UTC | #9
On 2022/9/20 11:04, Ming Lei wrote:
> On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
>> On 2022/9/19 20:33, Ming Lei wrote:
>>>>>> +
>>>>>> +static void ublk_quiesce_queue(struct ublk_device *ub,
>>>>>> +		struct ublk_queue *ubq)
>>>>>> +{
>>>>>> +	int i;
>>>>>> +
>>>>>> +	for (i = 0; i < ubq->q_depth; i++) {
>>>>>> +		struct ublk_io *io = &ubq->ios[i];
>>>>>> +
>>>>>> +		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
>>>>>> +			struct request *rq = blk_mq_tag_to_rq(
>>>>>> +					ub->tag_set.tags[ubq->q_id], i);
>>>>>> +
>>>>>> +			WARN_ON_ONCE(!rq);
>>>>>> +			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
>>>>>> +					ublk_queue_can_use_recovery_reissue(ubq) ?
>>>>>> +					"requeue" : "abort",
>>>>>> +					ubq->q_id, i, io->flags);
>>>>>> +			if (ublk_queue_can_use_recovery_reissue(ubq))
>>>>>> +				blk_mq_requeue_request(rq, false);
>>>>>
>>>>> This way is too violent.
>>>>>
>>>>> There may be just one queue dying, but you requeue all requests
>>>>> from any queue. I'd suggest to take the approach in ublk_daemon_monitor_work(),
>>>>> such as, just requeuing requests in dying queue.
>>>>
>>>> If we want to start a new process after a crash for USER_RECOVERY, all old ubq_daemons
>>>> must exit and rqs of all queues have to be requeued/aborted. We cannot let live
>>>> ubq_daemons run any more because they do not belong to the new process.
>>>
>>> IMO, the old process really can exist, and recently even I got such
>>> requirement for switching queue from one thread to another.
>>
>> For now, only one process can open /dev/ublkcX, so a new process is necessary now.
>>
>> If you think "per ubq_daemon" recovery is reasonable, I can do that in the future
>> if multiple processes is supported. But I really suggest that we can keep current
>> design as the first step which assumes all ubq_daemons are exited and a new process
>> is started, and that really meets our requirement.
>>
>> BTW, START_USER_RECOVERY has to be reconsidered because we may need to pass a ubq_id
>> with it.
>>
>>>
>>> What we should do is to get all inflight requests done, and cancel all io
>>> commands, no matter if the ubq pthread is dead or live.
>>>
>>>>
>>>> BTW, I really wonder why there could be just one queue dying? All queues must be dying
>>>> shortly after any ubq_daemon is dying since they are all pthreads in the same process.
>>>
>>> You can't assume it is always so. Maybe one pthread is dead first, and
>>> others are dying later, maybe just one is dead.
>>
>> Yes, I know there may be only one pthread is dead while others keep running, but now
>> ublk_drv only support one process opening the same /dev/ublkcX, so other pthreads
>> must dead(no matter they are aborted by signal or themselves) later.
>>
>>>
>>> If one queue's pthread is live, you may get trouble by simply requeuing
>>> the request, that is why I suggest to re-use the logic of
>>> ublk_daemon_monitor_work/ublk_abort_queue().
>>
>> Actually, if any ubq_daemon is live, no rqs are requeued, please see the check in
>> ublk_quiesce_dev(). It always makes sure that ALL ubq_daemons are dying, then it
>> starts quiesce jobs.
> 
> OK, looks I miss this point, but you should have quiesced queue at the
> beginning of ublk_quiesce_dev(), then the transition period can be kept
> as short as possible. Otherwise, if one queue pthread isn't dying, the
> device can be kept in this part-working state forever.
> 

Ming, this is what you said in PATCH V2:
"
The simplest handling might be to exit all ublk queues first, and re-create one
new process to recover all since the request queue is required to be
quiesced first, and all ublk queue is actually quiesced too. So from user
viewpoint, there is nothing visible comparing with just recovering
single ubq daemon/pthread.
"

So I assume that quiesce_work starts only after all ubq_daemons are dying.
Note that current ublk does not support multiple processes opening the same chardev.

Really we should agree on this. My suggestion is that we only consider "all ubq_daemons
are dying".

You mention that someone wants to enable "switching a ubq_daemon pthread to another one",
and I think that is another feature, not part of the recovery feature.

Regards,
Zhang.
Ming Lei Sept. 20, 2022, 4:49 a.m. UTC | #10
On Tue, Sep 20, 2022 at 12:39:31PM +0800, Ziyang Zhang wrote:
> On 2022/9/20 12:01, Ming Lei wrote:
> > On Tue, Sep 20, 2022 at 11:24:12AM +0800, Ziyang Zhang wrote:
> >> On 2022/9/20 11:04, Ming Lei wrote:
> >>> On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
> >>>
> >>> Follows the delta patch against patch 5 for showing the idea:
> >>>
> >>>
> >>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> >>> index 4409a130d0b6..60c5786c4711 100644
> >>> --- a/drivers/block/ublk_drv.c
> >>> +++ b/drivers/block/ublk_drv.c
> >>> @@ -656,7 +656,8 @@ static void ublk_complete_rq(struct request *req)
> >>>   * Also aborting may not be started yet, keep in mind that one failed
> >>>   * request may be issued by block layer again.
> >>>   */
> >>> -static void __ublk_fail_req(struct ublk_io *io, struct request *req)
> >>> +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
> >>> +		struct request *req)
> >>>  {
> >>>  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
> >>>  
> >>> @@ -667,7 +668,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
> >>>  				req->tag,
> >>>  				io->flags);
> >>>  		io->flags |= UBLK_IO_FLAG_ABORTED;
> >>> -		blk_mq_end_request(req, BLK_STS_IOERR);
> >>> +		if (ublk_queue_can_use_recovery_reissue(ubq))
> >>> +			blk_mq_requeue_request(req, false);
> >>
> >> Here is one problem:
> >> We reset io->flags to 0 in ublk_queue_reinit() and it is called before new
> > 
> > As we agreed, ublk_queue_reinit() will be moved to ublk_ch_release(), when there isn't
> > any inflight request, which is completed by either ublk server or __ublk_fail_req().
> > 
> > So clearing io->flags isn't related with quisceing device.
> > 
> >> ubq_daemon with FETCH_REQ is accepted. ublk_abort_queue() is not protected with
> >> ub_mutex and it is called many times in monitor_work. So same rq may be requeued
> >> multiple times.
> > 
> > UBLK_IO_FLAG_ABORTED is set for the slot, so one req is only ended or
> > requeued just once.
> 
> Yes, we can move ublk_queue_reinit() into ublk_ch_release(), but monitor_work is scheduled
> periodically so ublk_abort_queue() is called multiple times. As ublk_queue_reinit() clear
> io->flags, ublk_abort_queue() can requeue the same rq twice. Note that monitor_work can be
> scheduled after ublk_ch_release().

No, monitor work is supposed to be shut down after in-flight requests are
drained.

>  
> > 
> >>
> >> With recovery disabled, there is no such problem since io->flags does not change
> >> until ublk_dev is released.
> > 
> > But we have agreed that ublk_queue_reinit() can be moved to release
> > handler of /dev/ublkcN.
> > 
> >>
> >> In my patch 5 I only requeue the same rq once. So re-using ublk_abort_queue() is
> >> hard for recovery feature.
> > 
> > No, the same rq is just requeued once. Here the point is:
> > 
> > 1) reuse previous pattern in ublk_stop_dev(), which is proved as
> > workable reliably
> > 
> > 2) avoid to stay in half-working state forever
> > 
> > 3) the behind idea is more simpler.
> 
> Ming, your patch requeue rqs with ACTVE unset. these rqs have been issued to the
> dying ubq_daemon. What I concern about is inflight rqs with ACTIVE set.

My patch drains all inflight requests no matter if ACTIVE is set or not,
and that is the reason why it is simpler.

Thanks,
Ming
Ziyang Zhang Sept. 20, 2022, 5:03 a.m. UTC | #11
On 2022/9/20 12:49, Ming Lei wrote:
> On Tue, Sep 20, 2022 at 12:39:31PM +0800, Ziyang Zhang wrote:
>> On 2022/9/20 12:01, Ming Lei wrote:
>>> On Tue, Sep 20, 2022 at 11:24:12AM +0800, Ziyang Zhang wrote:
>>>> On 2022/9/20 11:04, Ming Lei wrote:
>>>>> On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
>>>>>
>>>>> Follows the delta patch against patch 5 for showing the idea:
>>>>>
>>>>>
>>>>> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
>>>>> index 4409a130d0b6..60c5786c4711 100644
>>>>> --- a/drivers/block/ublk_drv.c
>>>>> +++ b/drivers/block/ublk_drv.c
>>>>> @@ -656,7 +656,8 @@ static void ublk_complete_rq(struct request *req)
>>>>>   * Also aborting may not be started yet, keep in mind that one failed
>>>>>   * request may be issued by block layer again.
>>>>>   */
>>>>> -static void __ublk_fail_req(struct ublk_io *io, struct request *req)
>>>>> +static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
>>>>> +		struct request *req)
>>>>>  {
>>>>>  	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
>>>>>  
>>>>> @@ -667,7 +668,10 @@ static void __ublk_fail_req(struct ublk_io *io, struct request *req)
>>>>>  				req->tag,
>>>>>  				io->flags);
>>>>>  		io->flags |= UBLK_IO_FLAG_ABORTED;
>>>>> -		blk_mq_end_request(req, BLK_STS_IOERR);
>>>>> +		if (ublk_queue_can_use_recovery_reissue(ubq))
>>>>> +			blk_mq_requeue_request(req, false);
>>>>
>>>> Here is one problem:
>>>> We reset io->flags to 0 in ublk_queue_reinit() and it is called before new
>>>
>>> As we agreed, ublk_queue_reinit() will be moved to ublk_ch_release(), when there isn't
>>> any inflight request, which is completed by either ublk server or __ublk_fail_req().
>>>
>>> So clearing io->flags isn't related to quiescing the device.
>>>
>>>> ubq_daemon with FETCH_REQ is accepted. ublk_abort_queue() is not protected with
>>>> ub_mutex and it is called many times in monitor_work. So same rq may be requeued
>>>> multiple times.
>>>
>>> UBLK_IO_FLAG_ABORTED is set for the slot, so one req is only ended or
>>> requeued just once.
>>
>> Yes, we can move ublk_queue_reinit() into ublk_ch_release(), but monitor_work is scheduled
>> periodically so ublk_abort_queue() is called multiple times. As ublk_queue_reinit() clears
>> io->flags, ublk_abort_queue() can requeue the same rq twice. Note that monitor_work can be
>> scheduled after ublk_ch_release().
> 
> No, monitor work is supposed to be shut down after in-flight requests are
> drained.


Let's add cancel_delayed_work_sync(&ub->monitor_work) in ublk_ch_release().
monitor_work should not be scheduled after ub's state is QUIESCED.
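
Something along these lines ought to be enough (a minimal sketch, assuming the
device is reachable via filp->private_data as set in ublk_ch_open(); the rest
of the release path, e.g. per-queue reinit, is elided since it is not shown in
this thread):

static int ublk_ch_release(struct inode *inode, struct file *filp)
{
	struct ublk_device *ub = filp->private_data;

	/*
	 * Stop the periodic monitor before the per-queue state is
	 * reinitialized, so a late monitor pass cannot observe the
	 * freshly cleared io->flags and requeue/abort a tag again.
	 */
	cancel_delayed_work_sync(&ub->monitor_work);

	/* the existing release work would follow here */
	return 0;
}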

Regards,
Zhang.
Ziyang Zhang Sept. 20, 2022, 5:05 a.m. UTC | #12
On 2022/9/20 12:45, Ziyang Zhang wrote:
> On 2022/9/20 11:04, Ming Lei wrote:
>> On Tue, Sep 20, 2022 at 09:49:33AM +0800, Ziyang Zhang wrote:
>>> On 2022/9/19 20:33, Ming Lei wrote:
>>>>>>> +
>>>>>>> +static void ublk_quiesce_queue(struct ublk_device *ub,
>>>>>>> +		struct ublk_queue *ubq)
>>>>>>> +{
>>>>>>> +	int i;
>>>>>>> +
>>>>>>> +	for (i = 0; i < ubq->q_depth; i++) {
>>>>>>> +		struct ublk_io *io = &ubq->ios[i];
>>>>>>> +
>>>>>>> +		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
>>>>>>> +			struct request *rq = blk_mq_tag_to_rq(
>>>>>>> +					ub->tag_set.tags[ubq->q_id], i);
>>>>>>> +
>>>>>>> +			WARN_ON_ONCE(!rq);
>>>>>>> +			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
>>>>>>> +					ublk_queue_can_use_recovery_reissue(ubq) ?
>>>>>>> +					"requeue" : "abort",
>>>>>>> +					ubq->q_id, i, io->flags);
>>>>>>> +			if (ublk_queue_can_use_recovery_reissue(ubq))
>>>>>>> +				blk_mq_requeue_request(rq, false);
>>>>>>
>>>>>> This way is too violent.
>>>>>>
>>>>>> There may be just one queue dying, but you requeue all requests
>>>>>> from any queue. I'd suggest taking the approach in ublk_daemon_monitor_work(),
>>>>>> such as just requeuing requests in the dying queue.
>>>>>
>>>>> If we want to start a new process after a crash for USER_RECOVERY, all old ubq_daemons
>>>>> must exit and rqs of all queues have to be requeued/aborted. We cannot let live
>>>>> ubq_daemons run any more because they do not belong to the new process.
>>>>
>>>> IMO, the old process really can still exist, and recently I even got such a
>>>> requirement for switching a queue from one thread to another.
>>>
>>> For now, only one process can open /dev/ublkcX, so a new process is necessary.
>>>
>>> If you think "per ubq_daemon" recovery is reasonable, I can do that in the future
>>> if multiple processes are supported. But I really suggest that we keep the current
>>> design as the first step, which assumes all ubq_daemons have exited and a new process
>>> is started; that really meets our requirement.
>>>
>>> BTW, START_USER_RECOVERY has to be reconsidered because we may need to pass a ubq_id
>>> with it.
>>>
>>>>
>>>> What we should do is get all inflight requests done and cancel all io
>>>> commands, no matter whether the ubq pthread is dead or alive.
>>>>
>>>>>
>>>>> BTW, I really wonder why there could be just one queue dying? All queues must be dying
>>>>> shortly after any ubq_daemon is dying since they are all pthreads in the same process.
>>>>
>>>> You can't assume it is always so. Maybe one pthread is dead first, and
>>>> others are dying later, maybe just one is dead.
>>>
>>> Yes, I know only one pthread may be dead while others keep running, but for now
>>> ublk_drv only supports one process opening the same /dev/ublkcX, so the other pthreads
>>> must die later (no matter whether they are aborted by a signal or exit themselves).
>>>
>>>>
>>>> If one queue's pthread is live, you may get into trouble by simply requeuing
>>>> the request; that is why I suggest reusing the logic of
>>>> ublk_daemon_monitor_work/ublk_abort_queue().
>>>
>>> Actually, if any ubq_daemon is live, no rqs are requeued; please see the check in
>>> ublk_quiesce_dev(). It always makes sure that ALL ubq_daemons are dying before it
>>> starts the quiesce work.
>>
>> OK, it looks like I missed this point, but you should have quiesced the queue at the
>> beginning of ublk_quiesce_dev() so that the transition period can be kept
>> as short as possible. Otherwise, if one queue pthread isn't dying, the
>> device can be kept in this part-working state forever.
>>
> 
> Ming, this is what you said in PATCH V2:
> "
> The simplest handling might be to exit all ublk queues first, and re-create one
> new process to recover all since the request queue is required to be
> quiesced first, and all ublk queue is actually quiesced too. So from user
> viewpoint, there is nothing visible comparing with just recovering
> single ubq daemon/pthread.
> "
> 
> So I assume that quiesce_work starts only after all ubq_daemons are dying.
> Note that current ublk does not support multiple processes opening the same chardev.
> 
> Really we should agree on this. My suggestion is that we only consider "all ubq_daemons
> are dying".
> 
> You mention that someone wants to enable "switching the ubq_daemon pthread to another one", and
> I think that is a separate feature, not the recovery feature.
> 
> Regards,
> Zhang.

This should be considered very carefully, Ming.
diff mbox series

Patch

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b067f33a1913..4409a130d0b6 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -121,7 +121,7 @@  struct ublk_queue {
 
 	unsigned long io_addr;	/* mapped vm address */
 	unsigned int max_io_sz;
-	bool abort_work_pending;
+	bool force_abort;
 	unsigned short nr_io_ready;	/* how many ios setup */
 	struct ublk_device *dev;
 	struct ublk_io ios[0];
@@ -163,6 +163,7 @@  struct ublk_device {
 	 * monitor each queue's daemon periodically
 	 */
 	struct delayed_work	monitor_work;
+	struct work_struct	quiesce_work;
 	struct work_struct	stop_work;
 };
 
@@ -660,6 +661,11 @@  static void __ublk_fail_req(struct ublk_io *io, struct request *req)
 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
 
 	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
+		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
+				__func__,
+				((struct ublk_queue *)req->mq_hctx->driver_data)->q_id,
+				req->tag,
+				io->flags);
 		io->flags |= UBLK_IO_FLAG_ABORTED;
 		blk_mq_end_request(req, BLK_STS_IOERR);
 	}
@@ -820,6 +826,21 @@  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	res = ublk_setup_iod(ubq, rq);
 	if (unlikely(res != BLK_STS_OK))
 		return BLK_STS_IOERR;
+	/* With recovery feature enabled, force_abort is set in
+	 * ublk_stop_dev() before calling del_gendisk() if ub's state
+	 * is QUIESCED. We have to abort all requeued and new rqs here
+	 * to let del_gendisk() move on. Besides, we do not call
+	 * io_uring_cmd_complete_in_task() to avoid UAF on io_uring ctx.
+	 *
+	 * Note: force_abort is guaranteed to be seen because it is set
+	 * before the request queue is unquiesced.
+	 */
+	if (unlikely(ubq->force_abort)) {
+		pr_devel("%s: abort rq: qid %d tag %d io_flags %x\n",
+				__func__, ubq->q_id, rq->tag,
+				ubq->ios[rq->tag].flags);
+		return BLK_STS_IOERR;
+	}
 
 	blk_mq_start_request(bd->rq);
 
@@ -1003,6 +1024,101 @@  static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
 	ublk_put_device(ub);
 }
 
+static bool ublk_check_inflight_rq(struct request *rq, void *data)
+{
+	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+	struct ublk_io *io = &ubq->ios[rq->tag];
+	bool *busy = data;
+
+	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+		*busy = true;
+		return false;
+	}
+	return true;
+}
+
+static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
+{
+	bool busy = false;
+
+	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
+	while (true) {
+		blk_mq_tagset_busy_iter(&ub->tag_set,
+				ublk_check_inflight_rq, &busy);
+		if (busy)
+			msleep(UBLK_REQUEUE_DELAY_MS);
+		else
+			break;
+	}
+}
+
+static void ublk_quiesce_queue(struct ublk_device *ub,
+		struct ublk_queue *ubq)
+{
+	int i;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
+			struct request *rq = blk_mq_tag_to_rq(
+					ub->tag_set.tags[ubq->q_id], i);
+
+			WARN_ON_ONCE(!rq);
+			pr_devel("%s: %s rq: qid %d tag %d io_flags %x\n", __func__,
+					ublk_queue_can_use_recovery_reissue(ubq) ?
+					"requeue" : "abort",
+					ubq->q_id, i, io->flags);
+			if (ublk_queue_can_use_recovery_reissue(ubq))
+				blk_mq_requeue_request(rq, false);
+			else
+				__ublk_fail_req(io, rq);
+		} else {
+			pr_devel("%s: done old cmd: qid %d tag %d\n",
+					__func__, ubq->q_id, i);
+			io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0);
+			io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+		}
+		ubq->nr_io_ready--;
+	}
+	WARN_ON_ONCE(ubq->nr_io_ready);
+}
+
+static void ublk_quiesce_dev(struct ublk_device *ub)
+{
+	int i;
+
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+		goto unlock;
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+		struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+		if (!ubq_daemon_is_dying(ubq))
+			goto unlock;
+	}
+	blk_mq_quiesce_queue(ub->ub_disk->queue);
+	ublk_wait_tagset_rqs_idle(ub);
+	pr_devel("%s: quiesce ub: dev_id %d\n",
+			__func__, ub->dev_info.dev_id);
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+		ublk_quiesce_queue(ub, ublk_get_queue(ub, i));
+
+	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
+ unlock:
+	mutex_unlock(&ub->mutex);
+}
+
+static void ublk_quiesce_work_fn(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, quiesce_work);
+
+	ublk_quiesce_dev(ub);
+}
+
 static void ublk_daemon_monitor_work(struct work_struct *work)
 {
 	struct ublk_device *ub =
@@ -1013,10 +1129,14 @@  static void ublk_daemon_monitor_work(struct work_struct *work)
 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
 
 		if (ubq_daemon_is_dying(ubq)) {
-			schedule_work(&ub->stop_work);
-
-			/* abort queue is for making forward progress */
-			ublk_abort_queue(ub, ubq);
+			if (ublk_queue_can_use_recovery(ubq)) {
+				schedule_work(&ub->quiesce_work);
+			} else {
+				schedule_work(&ub->stop_work);
+
+				/* abort queue is for making forward progress */
+				ublk_abort_queue(ub, ubq);
+			}
 		}
 	}
 
@@ -1080,12 +1200,43 @@  static void ublk_cancel_dev(struct ublk_device *ub)
 		ublk_cancel_queue(ublk_get_queue(ub, i));
 }
 
+static void ublk_unquiesce_dev(struct ublk_device *ub)
+{
+	int i;
+
+	pr_devel("%s: ub state %s\n", __func__,
+			ub->dev_info.state == UBLK_S_DEV_LIVE ?
+			"LIVE" : "QUIESCED");
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
+		/*
+		 * quiesce_work cannot be running. We let monitor_work,
+		 * ublk_queue_rq() and task_work abort rqs instead of
+		 * requeuing them with a dying ubq_daemon. Then
+		 * del_gendisk() can move on.
+		 */
+		ublk_disable_recovery(ub);
+	} else {
+		/* quiesce_work has run. We let requeued rqs be aborted
+		 * before running fallback_wq. "force_abort" must be seen
+		 * after the request queue is unquiesced. Then del_gendisk()
+		 * can move on.
+		 */
+		for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+			ublk_get_queue(ub, i)->force_abort = true;
+
+		blk_mq_unquiesce_queue(ub->ub_disk->queue);
+		/* We may have requeued some rqs in ublk_quiesce_queue() */
+		blk_mq_kick_requeue_list(ub->ub_disk->queue);
+	}
+}
+
 static void ublk_stop_dev(struct ublk_device *ub)
 {
 	mutex_lock(&ub->mutex);
-	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
 		goto unlock;
-
+	if (ublk_can_use_recovery(ub))
+		ublk_unquiesce_dev(ub);
 	del_gendisk(ub->ub_disk);
 	ub->dev_info.state = UBLK_S_DEV_DEAD;
 	ub->dev_info.ublksrv_pid = -1;
@@ -1409,6 +1560,7 @@  static void ublk_remove(struct ublk_device *ub)
 {
 	ublk_stop_dev(ub);
 	cancel_work_sync(&ub->stop_work);
+	cancel_work_sync(&ub->quiesce_work);
 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
 	put_device(&ub->cdev_dev);
 }
@@ -1585,6 +1737,7 @@  static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 		goto out_unlock;
 	mutex_init(&ub->mutex);
 	spin_lock_init(&ub->mm_lock);
+	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
 	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
 	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
 
@@ -1705,6 +1858,7 @@  static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd)
 
 	ublk_stop_dev(ub);
 	cancel_work_sync(&ub->stop_work);
+	cancel_work_sync(&ub->quiesce_work);
 
 	ublk_put_device(ub);
 	return 0;