Message ID: 20231025061952.288730-3-xiubli@redhat.com (mailing list archive)
State: New, archived
Series: ceph: fix caps revocation stuck
On Wed, Oct 25, 2023 at 8:22 AM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> When unlinking a file the check caps could be delayed for more than
> 5 seconds, but in MDS side it maybe waiting for the clients to
> release caps.
>
> This will add a dedicated work queue and list to help trigger to
> fire the check caps and dirty buffer flushing immediately.
>
> URL: https://tracker.ceph.com/issues/50223
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/caps.c       | 17 ++++++++++++++++-
>  fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++++++++
>  fs/ceph/mds_client.h |  4 ++++
>  3 files changed, 54 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 9b9ec1adc19d..be4f986e082d 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -4790,7 +4790,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
>                 if (__ceph_caps_dirty(ci)) {
>                         struct ceph_mds_client *mdsc =
>                                 ceph_inode_to_fs_client(inode)->mdsc;
> -                       __cap_delay_requeue_front(mdsc, ci);
> +
> +                       doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
> +                             ceph_vinop(inode));
> +                       spin_lock(&mdsc->cap_unlink_delay_lock);
> +                       ci->i_ceph_flags |= CEPH_I_FLUSH;
> +                       if (!list_empty(&ci->i_cap_delay_list))
> +                               list_del_init(&ci->i_cap_delay_list);
> +                       list_add_tail(&ci->i_cap_delay_list,
> +                                     &mdsc->cap_unlink_delay_list);
> +                       spin_unlock(&mdsc->cap_unlink_delay_lock);
> +
> +                       /*
> +                        * Fire the work immediately, because the MDS maybe
> +                        * waiting for caps release.
> +                        */
> +                       schedule_work(&mdsc->cap_unlink_work);

Hi Xiubo,

This schedules a work on a system-wide workqueue, not specific to
CephFS. Is there something that ensures that it gets flushed as part
of unmount and possibly on other occasions that have to do with
individual inodes?

Thanks,

                Ilya
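For background on the concern raised above: schedule_work() places the work
item on the kernel's shared system_wq, which CephFS neither owns nor flushes
at unmount. The short sketch below only illustrates that distinction; my_wq
stands for a hypothetical filesystem-owned workqueue and is not something in
the posted patch.

#include <linux/workqueue.h>

/*
 * Illustration only: contrast between the shared system workqueue and a
 * queue owned by the filesystem.  "my_wq" is hypothetical.
 */
static bool queue_on_system_wq(struct work_struct *work)
{
        /* schedule_work() is shorthand for queue_work(system_wq, work) */
        return schedule_work(work);
}

static bool queue_on_private_wq(struct workqueue_struct *my_wq,
                                struct work_struct *work)
{
        /* a private queue can be flushed/destroyed when the fs unmounts */
        return queue_work(my_wq, work);
}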
On 1/16/24 06:44, Ilya Dryomov wrote:
> On Wed, Oct 25, 2023 at 8:22 AM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> When unlinking a file the check caps could be delayed for more than
>> 5 seconds, but in MDS side it maybe waiting for the clients to
>> release caps.
>>
>> This will add a dedicated work queue and list to help trigger to
>> fire the check caps and dirty buffer flushing immediately.
>>
>> URL: https://tracker.ceph.com/issues/50223
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>  fs/ceph/caps.c       | 17 ++++++++++++++++-
>>  fs/ceph/mds_client.c | 34 ++++++++++++++++++++++++++++++++++
>>  fs/ceph/mds_client.h |  4 ++++
>>  3 files changed, 54 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 9b9ec1adc19d..be4f986e082d 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -4790,7 +4790,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
>>                 if (__ceph_caps_dirty(ci)) {
>>                         struct ceph_mds_client *mdsc =
>>                                 ceph_inode_to_fs_client(inode)->mdsc;
>> -                       __cap_delay_requeue_front(mdsc, ci);
>> +
>> +                       doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
>> +                             ceph_vinop(inode));
>> +                       spin_lock(&mdsc->cap_unlink_delay_lock);
>> +                       ci->i_ceph_flags |= CEPH_I_FLUSH;
>> +                       if (!list_empty(&ci->i_cap_delay_list))
>> +                               list_del_init(&ci->i_cap_delay_list);
>> +                       list_add_tail(&ci->i_cap_delay_list,
>> +                                     &mdsc->cap_unlink_delay_list);
>> +                       spin_unlock(&mdsc->cap_unlink_delay_lock);
>> +
>> +                       /*
>> +                        * Fire the work immediately, because the MDS maybe
>> +                        * waiting for caps release.
>> +                        */
>> +                       schedule_work(&mdsc->cap_unlink_work);
> Hi Xiubo,
>
> This schedules a work on a system-wide workqueue, not specific to
> CephFS. Is there something that ensures that it gets flushed as part
> of unmount and possibly on other occasions that have to do with
> individual inodes?

Hi Ilya,

There seems to be none, I didn't find any. Let me add a dedicated
workqueue for this.

Thanks

- Xiubo

> Thanks,
>
>                 Ilya
>
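A minimal sketch of the direction agreed on above, assuming a dedicated
workqueue that is created together with the mount and flushed and destroyed
with it. The container struct and the cap_unlink_wq field are illustrative
stand-ins, not code from the posted patch or from fs/ceph; only the workqueue
API calls themselves are real.

#include <linux/workqueue.h>
#include <linux/errno.h>

/* Hypothetical container standing in for the per-mount mds_client state. */
struct cap_unlink_ctx {
        struct workqueue_struct *cap_unlink_wq; /* assumed new field */
        struct work_struct cap_unlink_work;
};

/* At init time: create a queue owned by this mount. */
static int cap_unlink_ctx_init(struct cap_unlink_ctx *ctx,
                               void (*fn)(struct work_struct *))
{
        INIT_WORK(&ctx->cap_unlink_work, fn);
        ctx->cap_unlink_wq = alloc_workqueue("ceph-cap-unlink", 0, 1);
        if (!ctx->cap_unlink_wq)
                return -ENOMEM;
        return 0;
}

/* Instead of schedule_work(): queue on the mount-owned workqueue. */
static void cap_unlink_fire(struct cap_unlink_ctx *ctx)
{
        queue_work(ctx->cap_unlink_wq, &ctx->cap_unlink_work);
}

/* At unmount: make sure queued work has run, then tear the queue down. */
static void cap_unlink_ctx_destroy(struct cap_unlink_ctx *ctx)
{
        flush_workqueue(ctx->cap_unlink_wq);
        destroy_workqueue(ctx->cap_unlink_wq);
}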
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9b9ec1adc19d..be4f986e082d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4790,7 +4790,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
                if (__ceph_caps_dirty(ci)) {
                        struct ceph_mds_client *mdsc =
                                ceph_inode_to_fs_client(inode)->mdsc;
-                       __cap_delay_requeue_front(mdsc, ci);
+
+                       doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+                             ceph_vinop(inode));
+                       spin_lock(&mdsc->cap_unlink_delay_lock);
+                       ci->i_ceph_flags |= CEPH_I_FLUSH;
+                       if (!list_empty(&ci->i_cap_delay_list))
+                               list_del_init(&ci->i_cap_delay_list);
+                       list_add_tail(&ci->i_cap_delay_list,
+                                     &mdsc->cap_unlink_delay_list);
+                       spin_unlock(&mdsc->cap_unlink_delay_lock);
+
+                       /*
+                        * Fire the work immediately, because the MDS maybe
+                        * waiting for caps release.
+                        */
+                       schedule_work(&mdsc->cap_unlink_work);
                }
        }
        spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6fa7134beec8..a7bffb030036 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2500,6 +2500,37 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
        }
 }
 
+static void ceph_cap_unlink_work(struct work_struct *work)
+{
+       struct ceph_mds_client *mdsc =
+               container_of(work, struct ceph_mds_client, cap_unlink_work);
+       struct ceph_client *cl = mdsc->fsc->client;
+
+       doutc(cl, "begin\n");
+       spin_lock(&mdsc->cap_unlink_delay_lock);
+       while (!list_empty(&mdsc->cap_unlink_delay_list)) {
+               struct ceph_inode_info *ci;
+               struct inode *inode;
+
+               ci = list_first_entry(&mdsc->cap_unlink_delay_list,
+                                     struct ceph_inode_info,
+                                     i_cap_delay_list);
+               list_del_init(&ci->i_cap_delay_list);
+
+               inode = igrab(&ci->netfs.inode);
+               if (inode) {
+                       spin_unlock(&mdsc->cap_unlink_delay_lock);
+                       doutc(cl, "on %p %llx.%llx\n", inode,
+                             ceph_vinop(inode));
+                       ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+                       iput(inode);
+                       spin_lock(&mdsc->cap_unlink_delay_lock);
+               }
+       }
+       spin_unlock(&mdsc->cap_unlink_delay_lock);
+       doutc(cl, "done\n");
+}
+
 /*
  * requests
  */
@@ -5372,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        INIT_LIST_HEAD(&mdsc->cap_delay_list);
        INIT_LIST_HEAD(&mdsc->cap_wait_list);
        spin_lock_init(&mdsc->cap_delay_lock);
+       INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
+       spin_lock_init(&mdsc->cap_unlink_delay_lock);
        INIT_LIST_HEAD(&mdsc->snap_flush_list);
        spin_lock_init(&mdsc->snap_flush_lock);
        mdsc->last_cap_flush_tid = 1;
@@ -5380,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        spin_lock_init(&mdsc->cap_dirty_lock);
        init_waitqueue_head(&mdsc->cap_flushing_wq);
        INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+       INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
        err = ceph_metric_init(&mdsc->metric);
        if (err)
                goto err_mdsmap;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2e6ddaa13d72..f25117fa910f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -462,6 +462,8 @@ struct ceph_mds_client {
        unsigned long    last_renew_caps;  /* last time we renewed our caps */
        struct list_head cap_delay_list;   /* caps with delayed release */
        spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
+       struct list_head cap_unlink_delay_list;  /* caps with delayed release for unlink */
+       spinlock_t       cap_unlink_delay_lock;  /* protects cap_unlink_delay_list */
        struct list_head snap_flush_list;  /* cap_snaps ready to flush */
        spinlock_t       snap_flush_lock;
 
@@ -475,6 +477,8 @@ struct ceph_mds_client {
        struct work_struct cap_reclaim_work;
        atomic_t           cap_reclaim_pending;
 
+       struct work_struct cap_unlink_work;
+
        /*
         * Cap reservations
         *
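One subtlety in ceph_cap_unlink_work() above: ceph_check_caps() may block, so
the work function drops cap_unlink_delay_lock around it and keeps the inode
pinned with igrab()/iput() while the lock is not held. Below is a stripped-down
sketch of that drain pattern with generic names in place of the CephFS types;
it simplifies by always dropping the lock for each item.

#include <linux/list.h>
#include <linux/spinlock.h>

struct drain_item {
        struct list_head node;
};

static LIST_HEAD(pending);              /* stand-in for cap_unlink_delay_list */
static DEFINE_SPINLOCK(pending_lock);   /* stand-in for cap_unlink_delay_lock */

static void handle_item(struct drain_item *it)
{
        /* stand-in for ceph_check_caps(); may sleep, so no spinlock held */
}

static void drain_pending(void)
{
        spin_lock(&pending_lock);
        while (!list_empty(&pending)) {
                struct drain_item *it;

                it = list_first_entry(&pending, struct drain_item, node);
                list_del_init(&it->node);

                /* drop the spinlock around the potentially blocking part */
                spin_unlock(&pending_lock);
                handle_item(it);
                spin_lock(&pending_lock);
        }
        spin_unlock(&pending_lock);
}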