diff mbox series

[v3] ceph: defer stopping the mdsc delayed_work

Message ID 20230725040359.363444-1-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v3] ceph: defer stopping the mdsc delayed_work | expand

Commit Message

Xiubo Li July 25, 2023, 4:03 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

Flushing the dirty buffer may take a long time if the Rados is
overloaded or if there is network issue. So we should ping the
MDSs periodically to keep alive, else the MDS will blocklist
the kclient.

Cc: stable@vger.kernel.org
Cc: Venky Shankar <vshankar@redhat.com>
URL: https://tracker.ceph.com/issues/61843
Reviewed-by: Milind Changire <mchangir@redhat.com>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---

V3:
- Rebased to the master branch


 fs/ceph/mds_client.c |  4 ++--
 fs/ceph/mds_client.h |  5 +++++
 fs/ceph/super.c      | 10 ++++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

Comments

Milind Changire July 31, 2023, 11:47 a.m. UTC | #1
On Tue, Jul 25, 2023 at 9:36 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> Flushing the dirty buffer may take a long time if the Rados is
> overloaded or if there is network issue. So we should ping the
> MDSs periodically to keep alive, else the MDS will blocklist
> the kclient.
>
> Cc: stable@vger.kernel.org
> Cc: Venky Shankar <vshankar@redhat.com>
> URL: https://tracker.ceph.com/issues/61843
> Reviewed-by: Milind Changire <mchangir@redhat.com>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>
> V3:
> - Rebased to the master branch
>
>
>  fs/ceph/mds_client.c |  4 ++--
>  fs/ceph/mds_client.h |  5 +++++
>  fs/ceph/super.c      | 10 ++++++++++
>  3 files changed, 17 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 66048a86c480..5fb367b1d4b0 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4764,7 +4764,7 @@ static void delayed_work(struct work_struct *work)
>
>         dout("mdsc delayed_work\n");
>
> -       if (mdsc->stopping)
> +       if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
>                 return;

Do we want to continue to accept/perform delayed work when
mdsc->stopping is set to CEPH_MDSC_STOPPING_BEGIN ?

I thought setting the STOPPING_BEGIN flag would immediately bar any
further new activity and STOPPING_FLUSHED would mark safe deletion of
the superblock.



>
>         mutex_lock(&mdsc->mutex);
> @@ -4943,7 +4943,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
>  void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>  {
>         dout("pre_umount\n");
> -       mdsc->stopping = 1;
> +       mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
>
>         ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
>         ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 724307ff89cd..86d2965e68a1 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -380,6 +380,11 @@ struct cap_wait {
>         int                     want;
>  };
>
> +enum {
> +       CEPH_MDSC_STOPPING_BEGIN = 1,
> +       CEPH_MDSC_STOPPING_FLUSHED = 2,
> +};
> +
>  /*
>   * mds client state
>   */
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index 3fc48b43cab0..a5f52013314d 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -1374,6 +1374,16 @@ static void ceph_kill_sb(struct super_block *s)
>         ceph_mdsc_pre_umount(fsc->mdsc);
>         flush_fs_workqueues(fsc);
>
> +       /*
> +        * Though the kill_anon_super() will finally trigger the
> +        * sync_filesystem() anyway, we still need to do it here
> +        * and then bump the stage of shutdown to stop the work
> +        * queue as earlier as possible.
> +        */
> +       sync_filesystem(s);
> +
> +       fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
> +
>         kill_anon_super(s);
>
>         fsc->client->extra_mon_dispatch = NULL;
> --
> 2.39.1
>
Xiubo Li July 31, 2023, 11:59 a.m. UTC | #2
On 7/31/23 19:47, Milind Changire wrote:
> On Tue, Jul 25, 2023 at 9:36 AM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> Flushing the dirty buffer may take a long time if the Rados is
>> overloaded or if there is network issue. So we should ping the
>> MDSs periodically to keep alive, else the MDS will blocklist
>> the kclient.
>>
>> Cc: stable@vger.kernel.org
>> Cc: Venky Shankar <vshankar@redhat.com>
>> URL: https://tracker.ceph.com/issues/61843
>> Reviewed-by: Milind Changire <mchangir@redhat.com>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>
>> V3:
>> - Rebased to the master branch
>>
>>
>>   fs/ceph/mds_client.c |  4 ++--
>>   fs/ceph/mds_client.h |  5 +++++
>>   fs/ceph/super.c      | 10 ++++++++++
>>   3 files changed, 17 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 66048a86c480..5fb367b1d4b0 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -4764,7 +4764,7 @@ static void delayed_work(struct work_struct *work)
>>
>>          dout("mdsc delayed_work\n");
>>
>> -       if (mdsc->stopping)
>> +       if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
>>                  return;
> Do we want to continue to accept/perform delayed work when
> mdsc->stopping is set to CEPH_MDSC_STOPPING_BEGIN ?
>
> I thought setting the STOPPING_BEGIN flag would immediately bar any
> further new activity and STOPPING_FLUSHED would mark safe deletion of
> the superblock.

Yes,  we need.

Locally I can reproduce this very easy with fsstress.sh script, please 
see https://tracker.ceph.com/issues/61843#note-1.

That's because when umounting and flushing the dirty buffer it could be 
blocked by the Rados dues to the lower disk space or MM reasons. And 
during this we need to ping MDS to keep alive to make sure the MDS won't 
evict us before we close the sessions later.

Thanks

- Xiubo

>
>
>>          mutex_lock(&mdsc->mutex);
>> @@ -4943,7 +4943,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
>>   void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
>>   {
>>          dout("pre_umount\n");
>> -       mdsc->stopping = 1;
>> +       mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
>>
>>          ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
>>          ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index 724307ff89cd..86d2965e68a1 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -380,6 +380,11 @@ struct cap_wait {
>>          int                     want;
>>   };
>>
>> +enum {
>> +       CEPH_MDSC_STOPPING_BEGIN = 1,
>> +       CEPH_MDSC_STOPPING_FLUSHED = 2,
>> +};
>> +
>>   /*
>>    * mds client state
>>    */
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index 3fc48b43cab0..a5f52013314d 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -1374,6 +1374,16 @@ static void ceph_kill_sb(struct super_block *s)
>>          ceph_mdsc_pre_umount(fsc->mdsc);
>>          flush_fs_workqueues(fsc);
>>
>> +       /*
>> +        * Though the kill_anon_super() will finally trigger the
>> +        * sync_filesystem() anyway, we still need to do it here
>> +        * and then bump the stage of shutdown to stop the work
>> +        * queue as earlier as possible.
>> +        */
>> +       sync_filesystem(s);
>> +
>> +       fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
>> +
>>          kill_anon_super(s);
>>
>>          fsc->client->extra_mon_dispatch = NULL;
>> --
>> 2.39.1
>>
>
Milind Changire Aug. 1, 2023, 8:33 a.m. UTC | #3
Looks good to me.

Reviewed-by: Milind Changire <mchangir@redhat.com>

On Mon, Jul 31, 2023 at 5:29 PM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 7/31/23 19:47, Milind Changire wrote:
> > On Tue, Jul 25, 2023 at 9:36 AM <xiubli@redhat.com> wrote:
> >> From: Xiubo Li <xiubli@redhat.com>
> >>
> >> Flushing the dirty buffer may take a long time if the Rados is
> >> overloaded or if there is network issue. So we should ping the
> >> MDSs periodically to keep alive, else the MDS will blocklist
> >> the kclient.
> >>
> >> Cc: stable@vger.kernel.org
> >> Cc: Venky Shankar <vshankar@redhat.com>
> >> URL: https://tracker.ceph.com/issues/61843
> >> Reviewed-by: Milind Changire <mchangir@redhat.com>
> >> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >> ---
> >>
> >> V3:
> >> - Rebased to the master branch
> >>
> >>
> >>   fs/ceph/mds_client.c |  4 ++--
> >>   fs/ceph/mds_client.h |  5 +++++
> >>   fs/ceph/super.c      | 10 ++++++++++
> >>   3 files changed, 17 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >> index 66048a86c480..5fb367b1d4b0 100644
> >> --- a/fs/ceph/mds_client.c
> >> +++ b/fs/ceph/mds_client.c
> >> @@ -4764,7 +4764,7 @@ static void delayed_work(struct work_struct *work)
> >>
> >>          dout("mdsc delayed_work\n");
> >>
> >> -       if (mdsc->stopping)
> >> +       if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
> >>                  return;
> > Do we want to continue to accept/perform delayed work when
> > mdsc->stopping is set to CEPH_MDSC_STOPPING_BEGIN ?
> >
> > I thought setting the STOPPING_BEGIN flag would immediately bar any
> > further new activity and STOPPING_FLUSHED would mark safe deletion of
> > the superblock.
>
> Yes,  we need.
>
> Locally I can reproduce this very easy with fsstress.sh script, please
> see https://tracker.ceph.com/issues/61843#note-1.
>
> That's because when umounting and flushing the dirty buffer it could be
> blocked by the Rados dues to the lower disk space or MM reasons. And
> during this we need to ping MDS to keep alive to make sure the MDS won't
> evict us before we close the sessions later.
>
> Thanks
>
> - Xiubo
>
> >
> >
> >>          mutex_lock(&mdsc->mutex);
> >> @@ -4943,7 +4943,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
> >>   void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
> >>   {
> >>          dout("pre_umount\n");
> >> -       mdsc->stopping = 1;
> >> +       mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
> >>
> >>          ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
> >>          ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
> >> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >> index 724307ff89cd..86d2965e68a1 100644
> >> --- a/fs/ceph/mds_client.h
> >> +++ b/fs/ceph/mds_client.h
> >> @@ -380,6 +380,11 @@ struct cap_wait {
> >>          int                     want;
> >>   };
> >>
> >> +enum {
> >> +       CEPH_MDSC_STOPPING_BEGIN = 1,
> >> +       CEPH_MDSC_STOPPING_FLUSHED = 2,
> >> +};
> >> +
> >>   /*
> >>    * mds client state
> >>    */
> >> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> >> index 3fc48b43cab0..a5f52013314d 100644
> >> --- a/fs/ceph/super.c
> >> +++ b/fs/ceph/super.c
> >> @@ -1374,6 +1374,16 @@ static void ceph_kill_sb(struct super_block *s)
> >>          ceph_mdsc_pre_umount(fsc->mdsc);
> >>          flush_fs_workqueues(fsc);
> >>
> >> +       /*
> >> +        * Though the kill_anon_super() will finally trigger the
> >> +        * sync_filesystem() anyway, we still need to do it here
> >> +        * and then bump the stage of shutdown to stop the work
> >> +        * queue as earlier as possible.
> >> +        */
> >> +       sync_filesystem(s);
> >> +
> >> +       fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
> >> +
> >>          kill_anon_super(s);
> >>
> >>          fsc->client->extra_mon_dispatch = NULL;
> >> --
> >> 2.39.1
> >>
> >
>
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 66048a86c480..5fb367b1d4b0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4764,7 +4764,7 @@  static void delayed_work(struct work_struct *work)
 
 	dout("mdsc delayed_work\n");
 
-	if (mdsc->stopping)
+	if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
 		return;
 
 	mutex_lock(&mdsc->mutex);
@@ -4943,7 +4943,7 @@  void send_flush_mdlog(struct ceph_mds_session *s)
 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 {
 	dout("pre_umount\n");
-	mdsc->stopping = 1;
+	mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
 
 	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
 	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 724307ff89cd..86d2965e68a1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -380,6 +380,11 @@  struct cap_wait {
 	int			want;
 };
 
+enum {
+       CEPH_MDSC_STOPPING_BEGIN = 1,
+       CEPH_MDSC_STOPPING_FLUSHED = 2,
+};
+
 /*
  * mds client state
  */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 3fc48b43cab0..a5f52013314d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1374,6 +1374,16 @@  static void ceph_kill_sb(struct super_block *s)
 	ceph_mdsc_pre_umount(fsc->mdsc);
 	flush_fs_workqueues(fsc);
 
+	/*
+	 * Though the kill_anon_super() will finally trigger the
+	 * sync_filesystem() anyway, we still need to do it here
+	 * and then bump the stage of shutdown to stop the work
+	 * queue as earlier as possible.
+	 */
+	sync_filesystem(s);
+
+	fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
+
 	kill_anon_super(s);
 
 	fsc->client->extra_mon_dispatch = NULL;