Message ID | 20171204180624.18722-2-bart.vanassche@wdc.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
On 12/04/2017 09:06 PM, Bart Van Assche wrote: > If scsi_eh_scmd_add() is called concurrently with > scsi_host_queue_ready() while shost->host_blocked > 0 then it can > happen that neither function wakes up the SCSI error handler. Fix > this by making every function that decreases the host_busy counter > wake up the error handler if necessary and by protecting the > host_failed checks with the SCSI host lock. > > Reported-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> > References: https://marc.info/?l=linux-kernel&m=150461610630736 > Fixes: commit 746650160866 ("scsi: convert host_busy to atomic_t") > Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com> > Cc: Konstantin Khorenko <khorenko@virtuozzo.com> > Cc: Stuart Hayes <stuart.w.hayes@gmail.com> > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> > Cc: Christoph Hellwig <hch@lst.de> > Cc: Hannes Reinecke <hare@suse.com> > Cc: Johannes Thumshirn <jthumshirn@suse.de> > Cc: <stable@vger.kernel.org> > --- > drivers/scsi/hosts.c | 6 ++++++ > drivers/scsi/scsi_error.c | 18 ++++++++++++++++-- > drivers/scsi/scsi_lib.c | 39 ++++++++++++++++++++++++++++----------- > include/scsi/scsi_host.h | 2 ++ > 4 files changed, 52 insertions(+), 13 deletions(-) > > diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c > index a306af6a5ea7..a0a7e4ff255c 100644 > --- a/drivers/scsi/hosts.c > +++ b/drivers/scsi/hosts.c > @@ -324,6 +324,9 @@ static void scsi_host_dev_release(struct device *dev) > > scsi_proc_hostdir_rm(shost->hostt); > > + /* Wait for functions invoked through call_rcu(&shost->rcu, ...) */ > + rcu_barrier(); > + > if (shost->tmf_work_q) > destroy_workqueue(shost->tmf_work_q); > if (shost->ehandler) > @@ -331,6 +334,8 @@ static void scsi_host_dev_release(struct device *dev) > if (shost->work_q) > destroy_workqueue(shost->work_q); > > + destroy_rcu_head(&shost->rcu); > + > if (shost->shost_state == SHOST_CREATED) { > /* > * Free the shost_dev device name here if scsi_host_alloc() > @@ -399,6 +404,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) > INIT_LIST_HEAD(&shost->starved_list); > init_waitqueue_head(&shost->host_wait); > mutex_init(&shost->scan_mutex); > + init_rcu_head(&shost->rcu); > > index = ida_simple_get(&host_index_ida, 0, 0, GFP_KERNEL); > if (index < 0) > diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c > index 5e89049e9b4e..258b8a741992 100644 > --- a/drivers/scsi/scsi_error.c > +++ b/drivers/scsi/scsi_error.c > @@ -226,6 +226,17 @@ static void scsi_eh_reset(struct scsi_cmnd *scmd) > } > } > > +static void scsi_eh_inc_host_failed(struct rcu_head *head) > +{ > + struct Scsi_Host *shost = container_of(head, typeof(*shost), rcu); > + unsigned long flags; > + > + spin_lock_irqsave(shost->host_lock, flags); > + shost->host_failed++; May be we need to increment host_failed before call_rcu(), so that all rcu protected readers already see a change at these point? > + scsi_eh_wakeup(shost); > + spin_unlock_irqrestore(shost->host_lock, flags); > +} > + > /** > * scsi_eh_scmd_add - add scsi cmd to error handling. > * @scmd: scmd to run eh on. > @@ -248,9 +259,12 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) > > scsi_eh_reset(scmd); > list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); > - shost->host_failed++; > - scsi_eh_wakeup(shost); > spin_unlock_irqrestore(shost->host_lock, flags); > + /* > + * Ensure that all tasks observe the host state change before the > + * host_failed change. > + */ > + call_rcu(&shost->rcu, scsi_eh_inc_host_failed); > } > > /** > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c > index b6d3842b6809..5cbc69b2b1ae 100644 > --- a/drivers/scsi/scsi_lib.c > +++ b/drivers/scsi/scsi_lib.c > @@ -318,22 +318,39 @@ static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) > cmd->cmd_len = scsi_command_size(cmd->cmnd); > } > > -void scsi_device_unbusy(struct scsi_device *sdev) > +/* > + * Decrement the host_busy counter and wake up the error handler if necessary. > + * Avoid as follows that the error handler is not woken up if shost->host_busy > + * == shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination > + * with an RCU read lock in this function to ensure that this function in its > + * entirety either finishes before scsi_eh_scmd_add() increases the > + * host_failed counter or that it notices the shost state change made by > + * scsi_eh_scmd_add(). > + */ > +static void scsi_dec_host_busy(struct Scsi_Host *shost) > { > - struct Scsi_Host *shost = sdev->host; > - struct scsi_target *starget = scsi_target(sdev); > unsigned long flags; > > + rcu_read_lock(); > atomic_dec(&shost->host_busy); > - if (starget->can_queue > 0) > - atomic_dec(&starget->target_busy); > - > - if (unlikely(scsi_host_in_recovery(shost) && > - (shost->host_failed || shost->host_eh_scheduled))) { > + if (unlikely(scsi_host_in_recovery(shost))) { > spin_lock_irqsave(shost->host_lock, flags); > - scsi_eh_wakeup(shost); > + if (shost->host_failed || shost->host_eh_scheduled) > + scsi_eh_wakeup(shost); > spin_unlock_irqrestore(shost->host_lock, flags); > } > + rcu_read_unlock(); > +} > + > +void scsi_device_unbusy(struct scsi_device *sdev) > +{ > + struct Scsi_Host *shost = sdev->host; > + struct scsi_target *starget = scsi_target(sdev); > + > + scsi_dec_host_busy(shost); > + > + if (starget->can_queue > 0) > + atomic_dec(&starget->target_busy); > > atomic_dec(&sdev->device_busy); > } > @@ -1531,7 +1548,7 @@ static inline int scsi_host_queue_ready(struct request_queue *q, > list_add_tail(&sdev->starved_entry, &shost->starved_list); > spin_unlock_irq(shost->host_lock); > out_dec: > - atomic_dec(&shost->host_busy); > + scsi_dec_host_busy(shost); > return 0; > } > > @@ -2017,7 +2034,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, > return BLK_STS_OK; > > out_dec_host_busy: > - atomic_dec(&shost->host_busy); > + scsi_dec_host_busy(shost); > out_dec_target_busy: > if (scsi_target(sdev)->can_queue > 0) > atomic_dec(&scsi_target(sdev)->target_busy); > diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h > index a8b7bf879ced..1a1df0d21ee3 100644 > --- a/include/scsi/scsi_host.h > +++ b/include/scsi/scsi_host.h > @@ -571,6 +571,8 @@ struct Scsi_Host { > struct blk_mq_tag_set tag_set; > }; > > + struct rcu_head rcu; > + > atomic_t host_busy; /* commands actually active on low-level */ > atomic_t host_blocked; > >
On Tue, 2017-12-05 at 13:18 +0300, Pavel Tikhomirov wrote: > On 12/04/2017 09:06 PM, Bart Van Assche wrote: > > +static void scsi_eh_inc_host_failed(struct rcu_head *head) > > +{ > > + struct Scsi_Host *shost = container_of(head, typeof(*shost), rcu); > > + unsigned long flags; > > + > > + spin_lock_irqsave(shost->host_lock, flags); > > + shost->host_failed++; > > May be we need to increment host_failed before call_rcu(), so that all > rcu protected readers already see a change at these point? Sorry but I don't think that would work. If host_failed would be changed first then it can happen that scsi_dec_host_busy() encounters that the host state is "running" and hence that it skips the host_failed check. That can result in a missed error handler wakeup, which is what we want to avoid. Bart.
On Wed, 2017-12-06 at 00:17 +0300, ptikhomirov wrote: > I mean threads in scsi_dec_host_busy() the part under rcu_read_lock are > divided into two groups: a) finished before call_rcu, b) beginning rcu > section after call_rcu. So first, in scsi_eh_inc_host_failed() we will > see all changes to host busy from group (a), second, all threads in group > (b) will see our change to host_failed. Either there is nobody in (b) and > we will start EH, or the thread from (b) which entered spin_lock last will > start EH. > > In your case tasks from b does not see host_failed was incremented, and > will not start EH. Hello Pavel, What does "your case" mean? In my previous e-mail I explained a scenario that cannot happen so it's not clear to me what "your case" refers to? Additionally, it seems like you are assuming that RCU guarantees ordering of RCU read-locked sections against call_rcu()? That's not how RCU works. RCU guarantees serialization of read-locked sections against grace periods. The function scsi_eh_inc_host_failed() is invoked through call_rcu() and hence will be called during a grace period. Anyway, the different scenarios I see are as follows: (a) scsi_dec_host_busy() finishes before scsi_eh_inc_host_failed() starts. (b) scsi_dec_host_busy() starts after scsi_eh_inc_host_failed() has finished. In case (a) scsi_eh_inc_host_failed() will wake up the error handler. And in case (b) scsi_dec_host_busy() will wake up the error handler. So it's not clear to me why you think that there is a scenario in which the EH won't be woken up? Bart.
Hello, Bart! On 12/06/2017 12:46 AM, Bart Van Assche wrote: > On Wed, 2017-12-06 at 00:17 +0300, ptikhomirov wrote: >> I mean threads in scsi_dec_host_busy() the part under rcu_read_lock are >> divided into two groups: a) finished before call_rcu, b) beginning rcu >> section after call_rcu. So first, in scsi_eh_inc_host_failed() we will >> see all changes to host busy from group (a), second, all threads in group >> (b) will see our change to host_failed. Either there is nobody in (b) and >> we will start EH, or the thread from (b) which entered spin_lock last will >> start EH. >> >> In your case tasks from b does not see host_failed was incremented, and >> will not start EH. > > Hello Pavel, > > What does "your case" mean? In my previous e-mail I explained a scenario that > cannot happen so it's not clear to me what "your case" refers to? By "your case" I meant how your code works, especially that host_failed increment is inside scsi_eh_inc_host_failed() in your patch. > > Additionally, it seems like you are assuming that RCU guarantees ordering of > RCU read-locked sections against call_rcu()? Sorry.. Not "call_rcu" itself, In my previous reply I meant the delayed callback which actually triggers. (s/call_rcu/call_rcu's callback start/) > That's not how RCU works. RCU > guarantees serialization of read-locked sections against grace periods. The > function scsi_eh_inc_host_failed() is invoked through call_rcu() and hence > will be called during a grace period. May be I missunderstand something, but I think that callback from call_rcu is guaranteed to _start_ after a grace period, but not to end before a next grace period. Other threads can enter rcu_read_lock protected critical regions while we are still in a callback and will run concurrently with callback. > > Anyway, the different scenarios I see are as follows: > (a) scsi_dec_host_busy() finishes before scsi_eh_inc_host_failed() starts. > (b) scsi_dec_host_busy() starts after scsi_eh_inc_host_failed() has > finished. So I think in (b) scsi_dec_host_busy starts after scsi_eh_inc_host_failed has _started_. > > In case (a) scsi_eh_inc_host_failed() will wake up the error handler. And in > case (b) scsi_dec_host_busy() will wake up the error handler. So it's not > clear to me why you think that there is a scenario in which the EH won't be > woken up? So in case (b), in my understanding, scsi_dec_host_busy can skip wakeups as it does not see host_failed change yet. > > Bart. > To prove my point, some parts of kernel docs: "This function invokes func(head) after a grace period has elapsed." Documentation/RCU/whatisRCU.txt " 15. The whole point of call_rcu(), synchronize_rcu(), and friends is to wait until all pre-existing readers have finished before carrying out some otherwise-destructive operation. ... Because these primitives only wait for pre-existing readers, it is the caller's responsibility to guarantee that any subsequent readers will execute safely. " Documentation/RCU/checklist.txt There is nothing about "callback ends before next grace period". Sorry, if I'm missing something. Pavel.
On Wed, 2017-12-06 at 01:49 +0300, Pavel Tikhomirov wrote: > On 12/06/2017 12:46 AM, Bart Van Assche wrote: > > Anyway, the different scenarios I see are as follows: > > (a) scsi_dec_host_busy() finishes before scsi_eh_inc_host_failed() starts. > > (b) scsi_dec_host_busy() starts after scsi_eh_inc_host_failed() has > > finished. > > So I think in (b) scsi_dec_host_busy starts after scsi_eh_inc_host_failed > has _started_. Agreed, and that's fine, since in that case the SCSI host state has alread been modified and hence both functions will obtain the SCSI host lock. The relevant code will be serialized through the SCSI host lock. > > In case (a) scsi_eh_inc_host_failed() will wake up the error handler. And in > > case (b) scsi_dec_host_busy() will wake up the error handler. So it's not > > clear to me why you think that there is a scenario in which the EH won't be > > woken up? > > So in case (b), in my understanding, scsi_dec_host_busy can skip wakeups > as it does not see host_failed change yet. That's not correct. If scsi_dec_host_busy() obtains the SCSI host lock before scsi_eh_inc_host_failed() obtains it then the latter function will trigger a SCSI EH wakeup. Bart.
>>> In case (a) scsi_eh_inc_host_failed() will wake up the error handler. And in >>> case (b) scsi_dec_host_busy() will wake up the error handler. So it's not >>> clear to me why you think that there is a scenario in which the EH won't be >>> woken up? >> >> So in case (b), in my understanding, scsi_dec_host_busy can skip wakeups >> as it does not see host_failed change yet. > > That's not correct. If scsi_dec_host_busy() obtains the SCSI host lock before > scsi_eh_inc_host_failed() obtains it then the latter function will trigger a > SCSI EH wakeup. You are right! Thanks a lot for pointing that out! Now when I understand it, your patch looks good for me: Reviewed-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> By the way, I very much like your idea of using rcu for these case. Thanks, Pavel. > > Bart. >
>>>> In case (a) scsi_eh_inc_host_failed() will wake up the error handler. And in >>>> case (b) scsi_dec_host_busy() will wake up the error handler. So it's not >>>> clear to me why you think that there is a scenario in which the EH won't be >>>> woken up? >>> >>> So in case (b), in my understanding, scsi_dec_host_busy can skip wakeups >>> as it does not see host_failed change yet. >> >> That's not correct. If scsi_dec_host_busy() obtains the SCSI host lock before >> scsi_eh_inc_host_failed() obtains it then the latter function will trigger a >> SCSI EH wakeup. > > You are right! Thanks a lot for pointing that out! Now when I understand it, your patch looks good for me: > > Reviewed-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> > > By the way, I very much like your idea of using rcu for these case. > > Thanks, Pavel. > This patch tests ok on my system, too... it's run for over 24 hours, on a system that typically fails within ten minutes without the patch... Tested-by: Stuart Hayes <stuart.w.hayes@gmail.com> Thanks, Stuart --- This email has been checked for viruses by Avast antivirus software. https://www.avast.com/antivirus
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index a306af6a5ea7..a0a7e4ff255c 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -324,6 +324,9 @@ static void scsi_host_dev_release(struct device *dev) scsi_proc_hostdir_rm(shost->hostt); + /* Wait for functions invoked through call_rcu(&shost->rcu, ...) */ + rcu_barrier(); + if (shost->tmf_work_q) destroy_workqueue(shost->tmf_work_q); if (shost->ehandler) @@ -331,6 +334,8 @@ static void scsi_host_dev_release(struct device *dev) if (shost->work_q) destroy_workqueue(shost->work_q); + destroy_rcu_head(&shost->rcu); + if (shost->shost_state == SHOST_CREATED) { /* * Free the shost_dev device name here if scsi_host_alloc() @@ -399,6 +404,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) INIT_LIST_HEAD(&shost->starved_list); init_waitqueue_head(&shost->host_wait); mutex_init(&shost->scan_mutex); + init_rcu_head(&shost->rcu); index = ida_simple_get(&host_index_ida, 0, 0, GFP_KERNEL); if (index < 0) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 5e89049e9b4e..258b8a741992 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -226,6 +226,17 @@ static void scsi_eh_reset(struct scsi_cmnd *scmd) } } +static void scsi_eh_inc_host_failed(struct rcu_head *head) +{ + struct Scsi_Host *shost = container_of(head, typeof(*shost), rcu); + unsigned long flags; + + spin_lock_irqsave(shost->host_lock, flags); + shost->host_failed++; + scsi_eh_wakeup(shost); + spin_unlock_irqrestore(shost->host_lock, flags); +} + /** * scsi_eh_scmd_add - add scsi cmd to error handling. * @scmd: scmd to run eh on. @@ -248,9 +259,12 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) scsi_eh_reset(scmd); list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); - shost->host_failed++; - scsi_eh_wakeup(shost); spin_unlock_irqrestore(shost->host_lock, flags); + /* + * Ensure that all tasks observe the host state change before the + * host_failed change. + */ + call_rcu(&shost->rcu, scsi_eh_inc_host_failed); } /** diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b6d3842b6809..5cbc69b2b1ae 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -318,22 +318,39 @@ static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) cmd->cmd_len = scsi_command_size(cmd->cmnd); } -void scsi_device_unbusy(struct scsi_device *sdev) +/* + * Decrement the host_busy counter and wake up the error handler if necessary. + * Avoid as follows that the error handler is not woken up if shost->host_busy + * == shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination + * with an RCU read lock in this function to ensure that this function in its + * entirety either finishes before scsi_eh_scmd_add() increases the + * host_failed counter or that it notices the shost state change made by + * scsi_eh_scmd_add(). + */ +static void scsi_dec_host_busy(struct Scsi_Host *shost) { - struct Scsi_Host *shost = sdev->host; - struct scsi_target *starget = scsi_target(sdev); unsigned long flags; + rcu_read_lock(); atomic_dec(&shost->host_busy); - if (starget->can_queue > 0) - atomic_dec(&starget->target_busy); - - if (unlikely(scsi_host_in_recovery(shost) && - (shost->host_failed || shost->host_eh_scheduled))) { + if (unlikely(scsi_host_in_recovery(shost))) { spin_lock_irqsave(shost->host_lock, flags); - scsi_eh_wakeup(shost); + if (shost->host_failed || shost->host_eh_scheduled) + scsi_eh_wakeup(shost); spin_unlock_irqrestore(shost->host_lock, flags); } + rcu_read_unlock(); +} + +void scsi_device_unbusy(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct scsi_target *starget = scsi_target(sdev); + + scsi_dec_host_busy(shost); + + if (starget->can_queue > 0) + atomic_dec(&starget->target_busy); atomic_dec(&sdev->device_busy); } @@ -1531,7 +1548,7 @@ static inline int scsi_host_queue_ready(struct request_queue *q, list_add_tail(&sdev->starved_entry, &shost->starved_list); spin_unlock_irq(shost->host_lock); out_dec: - atomic_dec(&shost->host_busy); + scsi_dec_host_busy(shost); return 0; } @@ -2017,7 +2034,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; out_dec_host_busy: - atomic_dec(&shost->host_busy); + scsi_dec_host_busy(shost); out_dec_target_busy: if (scsi_target(sdev)->can_queue > 0) atomic_dec(&scsi_target(sdev)->target_busy); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index a8b7bf879ced..1a1df0d21ee3 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -571,6 +571,8 @@ struct Scsi_Host { struct blk_mq_tag_set tag_set; }; + struct rcu_head rcu; + atomic_t host_busy; /* commands actually active on low-level */ atomic_t host_blocked;
If scsi_eh_scmd_add() is called concurrently with scsi_host_queue_ready() while shost->host_blocked > 0 then it can happen that neither function wakes up the SCSI error handler. Fix this by making every function that decreases the host_busy counter wake up the error handler if necessary and by protecting the host_failed checks with the SCSI host lock. Reported-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> References: https://marc.info/?l=linux-kernel&m=150461610630736 Fixes: commit 746650160866 ("scsi: convert host_busy to atomic_t") Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com> Cc: Konstantin Khorenko <khorenko@virtuozzo.com> Cc: Stuart Hayes <stuart.w.hayes@gmail.com> Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Hannes Reinecke <hare@suse.com> Cc: Johannes Thumshirn <jthumshirn@suse.de> Cc: <stable@vger.kernel.org> --- drivers/scsi/hosts.c | 6 ++++++ drivers/scsi/scsi_error.c | 18 ++++++++++++++++-- drivers/scsi/scsi_lib.c | 39 ++++++++++++++++++++++++++++----------- include/scsi/scsi_host.h | 2 ++ 4 files changed, 52 insertions(+), 13 deletions(-)