[v2,1/2] ceph: add ceph_async_put_cap_refs to avoid double lock and deadlock

Message ID 1590405385-27406-2-git-send-email-xiubli@redhat.com (mailing list archive)
State New, archived
Series ceph: fix dead lock and double lock

Commit Message

Xiubo Li May 25, 2020, 11:16 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

ceph_check_caps() may lock and unlock the session mutex.

This can lead to deadlocks, for example:
handle_forward()
...
mutex_lock(&mdsc->mutex)
...
ceph_mdsc_put_request()
  --> ceph_mdsc_release_request()
    --> ceph_put_cap_request()
      --> ceph_put_cap_refs()
        --> ceph_check_caps()
...
mutex_unlock(&mdsc->mutex)

It can also lead to double session locking, for example:

send_mds_reconnect()
...
mutex_lock(&session->s_mutex);
...
  --> replay_unsafe_requests()
    --> ceph_mdsc_release_dir_caps()
      --> ceph_put_cap_refs()
        --> ceph_check_caps()
...
mutex_unlock(&session->s_mutex);
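
To make the second case concrete: kernel mutexes are not recursive, so
re-acquiring session->s_mutex on the thread that already holds it blocks
forever. A userspace pthreads analogue of the same pattern (not ceph
code; the names are stand-ins for the kernel functions above):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t s_mutex = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for ceph_check_caps(): needs the session mutex */
static void check_caps(void)
{
	pthread_mutex_lock(&s_mutex);	/* second acquisition: blocks forever */
	pthread_mutex_unlock(&s_mutex);
}

int main(void)
{
	pthread_mutex_lock(&s_mutex);	/* stand-in for send_mds_reconnect() */
	check_caps();			/* never returns */
	pthread_mutex_unlock(&s_mutex);
	puts("unreachable");
	return 0;
}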

URL: https://tracker.ceph.com/issues/45635
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c       | 29 +++++++++++++++++++++++++++++
 fs/ceph/inode.c      |  3 +++
 fs/ceph/mds_client.c | 12 +++++++-----
 fs/ceph/super.h      |  5 +++++
 4 files changed, 44 insertions(+), 5 deletions(-)

Comments

Yan, Zheng May 26, 2020, 3:11 a.m. UTC | #1
On 5/25/20 7:16 PM, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> ceph_check_caps() may lock and unlock the session mutex.
> 
> This can lead to deadlocks, for example:
> handle_forward()
> ...
> mutex_lock(&mdsc->mutex)
> ...
> ceph_mdsc_put_request()
>    --> ceph_mdsc_release_request()
>      --> ceph_put_cap_request()
>        --> ceph_put_cap_refs()
>          --> ceph_check_caps()
> ...
> mutex_unlock(&mdsc->mutex)
> 
> It can also lead to double session locking, for example:
> 
> send_mds_reconnect()
> ...
> mutex_lock(&session->s_mutex);
> ...
>    --> replay_unsafe_requests()
>      --> ceph_mdsc_release_dir_caps()
>        --> ceph_put_cap_refs()
>          --> ceph_check_caps()
> ...
> mutex_unlock(&session->s_mutex);
> 
> URL: https://tracker.ceph.com/issues/45635
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>   fs/ceph/caps.c       | 29 +++++++++++++++++++++++++++++
>   fs/ceph/inode.c      |  3 +++
>   fs/ceph/mds_client.c | 12 +++++++-----
>   fs/ceph/super.h      |  5 +++++
>   4 files changed, 44 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 27c2e60..aea66c1 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -3082,6 +3082,35 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
>   		iput(inode);
>   }
>   
> +void ceph_async_put_cap_refs_work(struct work_struct *work)
> +{
> +	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
> +						  put_cap_refs_work);
> +	int caps;
> +
> +	spin_lock(&ci->i_ceph_lock);
> +	caps = xchg(&ci->pending_put_caps, 0);
> +	spin_unlock(&ci->i_ceph_lock);
> +
> +	ceph_put_cap_refs(ci, caps);
> +}
> +
> +void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had)
> +{
> +	struct inode *inode = &ci->vfs_inode;
> +
> +	spin_lock(&ci->i_ceph_lock);
> +	if (ci->pending_put_caps & had) {
> +	        spin_unlock(&ci->i_ceph_lock);
> +		return;
> +	}

This will cause a cap ref leak.

I thought about this again. All the trouble is caused by calling
ceph_mdsc_release_dir_caps() inside ceph_mdsc_release_request().
We already call ceph_mdsc_release_dir_caps() in ceph_async_foo_cb() for
the normal circumstance. We just need to call ceph_mdsc_release_dir_caps()
in the 'session closed' case (we never abort async requests). In the
'session closed' case, we can use ceph_put_cap_refs_no_check_caps().

Regards
Yan, Zheng

> +
> +	ci->pending_put_caps |= had;
> +	spin_unlock(&ci->i_ceph_lock);
> +
> +	queue_work(ceph_inode_to_client(inode)->inode_wq,
> +		   &ci->put_cap_refs_work);
> +}
>   /*
>    * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
>    * context.  Adjust per-snap dirty page accounting as appropriate.
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 357c937..303276a 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -517,6 +517,9 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>   	INIT_LIST_HEAD(&ci->i_snap_realm_item);
>   	INIT_LIST_HEAD(&ci->i_snap_flush_item);
>   
> +	INIT_WORK(&ci->put_cap_refs_work, ceph_async_put_cap_refs_work);
> +	ci->pending_put_caps = 0;
> +
>   	INIT_WORK(&ci->i_work, ceph_inode_work);
>   	ci->i_work_mask = 0;
>   	memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 0e0ab01..40b31da 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -811,12 +811,14 @@ void ceph_mdsc_release_request(struct kref *kref)
>   	if (req->r_reply)
>   		ceph_msg_put(req->r_reply);
>   	if (req->r_inode) {
> -		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
> +		ceph_async_put_cap_refs(ceph_inode(req->r_inode),
> +					CEPH_CAP_PIN);
>   		/* avoid calling iput_final() in mds dispatch threads */
>   		ceph_async_iput(req->r_inode);
>   	}
>   	if (req->r_parent) {
> -		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
> +		ceph_async_put_cap_refs(ceph_inode(req->r_parent),
> +					CEPH_CAP_PIN);
>   		ceph_async_iput(req->r_parent);
>   	}
>   	ceph_async_iput(req->r_target_inode);
> @@ -831,8 +833,8 @@ void ceph_mdsc_release_request(struct kref *kref)
>   		 * changed between the dir mutex being dropped and
>   		 * this request being freed.
>   		 */
> -		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
> -				  CEPH_CAP_PIN);
> +		ceph_async_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
> +					CEPH_CAP_PIN);
>   		ceph_async_iput(req->r_old_dentry_dir);
>   	}
>   	kfree(req->r_path1);
> @@ -3398,7 +3400,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
>   	dcaps = xchg(&req->r_dir_caps, 0);
>   	if (dcaps) {
>   		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
> -		ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
> +		ceph_async_put_cap_refs(ceph_inode(req->r_parent), dcaps);
>   	}
>   }
>   
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 226f19c..01d206f 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -421,6 +421,9 @@ struct ceph_inode_info {
>   	struct timespec64 i_btime;
>   	struct timespec64 i_snap_btime;
>   
> +	struct work_struct put_cap_refs_work;
> +	int pending_put_caps;
> +
>   	struct work_struct i_work;
>   	unsigned long  i_work_mask;
>   
> @@ -1095,6 +1098,8 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
>   				bool snap_rwsem_locked);
>   extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
>   extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
> +extern void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had);
> +extern void ceph_async_put_cap_refs_work(struct work_struct *work);
>   extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
>   				       struct ceph_snap_context *snapc);
>   extern void ceph_flush_snaps(struct ceph_inode_info *ci,
>
Xiubo Li May 26, 2020, 3:38 a.m. UTC | #2
On 2020/5/26 11:11, Yan, Zheng wrote:
> On 5/25/20 7:16 PM, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> ceph_check_caps() may lock and unlock the session mutex.
>>
>> This can lead to deadlocks, for example:
>> handle_forward()
>> ...
>> mutex_lock(&mdsc->mutex)
>> ...
>> ceph_mdsc_put_request()
>>    --> ceph_mdsc_release_request()
>>      --> ceph_put_cap_request()
>>        --> ceph_put_cap_refs()
>>          --> ceph_check_caps()
>> ...
>> mutex_unlock(&mdsc->mutex)
>>
>> It can also lead to double session locking, for example:
>>
>> send_mds_reconnect()
>> ...
>> mutex_lock(&session->s_mutex);
>> ...
>>    --> replay_unsafe_requests()
>>      --> ceph_mdsc_release_dir_caps()
>>        --> ceph_put_cap_refs()
>>          --> ceph_check_caps()
>> ...
>> mutex_unlock(&session->s_mutex);
>>
>> URL: https://tracker.ceph.com/issues/45635
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c       | 29 +++++++++++++++++++++++++++++
>>   fs/ceph/inode.c      |  3 +++
>>   fs/ceph/mds_client.c | 12 +++++++-----
>>   fs/ceph/super.h      |  5 +++++
>>   4 files changed, 44 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 27c2e60..aea66c1 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -3082,6 +3082,35 @@ void ceph_put_cap_refs(struct ceph_inode_info 
>> *ci, int had)
>>           iput(inode);
>>   }
>>   +void ceph_async_put_cap_refs_work(struct work_struct *work)
>> +{
>> +    struct ceph_inode_info *ci = container_of(work, struct 
>> ceph_inode_info,
>> +                          put_cap_refs_work);
>> +    int caps;
>> +
>> +    spin_lock(&ci->i_ceph_lock);
>> +    caps = xchg(&ci->pending_put_caps, 0);
>> +    spin_unlock(&ci->i_ceph_lock);
>> +
>> +    ceph_put_cap_refs(ci, caps);
>> +}
>> +
>> +void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had)
>> +{
>> +    struct inode *inode = &ci->vfs_inode;
>> +
>> +    spin_lock(&ci->i_ceph_lock);
>> +    if (ci->pending_put_caps & had) {
>> +            spin_unlock(&ci->i_ceph_lock);
>> +        return;
>> +    }
>
> This will cause a cap ref leak.

Ah, yeah, right.


>
> I thought about this again. All the trouble is caused by calling 
> ceph_mdsc_release_dir_caps() inside ceph_mdsc_release_request().

Also, ceph_mdsc_release_request() calls ceph_put_cap_refs() directly
in three other places.

BRs

Xiubo

> We already call ceph_mdsc_release_dir_caps() in ceph_async_foo_cb() 
> for the normal circumstance. We just need to call 
> ceph_mdsc_release_dir_caps() in the 'session closed' case (we never 
> abort async requests). In the 'session closed' case, we can use 
> ceph_put_cap_refs_no_check_caps().
>
> Regards
> Yan, Zheng
>
>> +
>> +    ci->pending_put_caps |= had;
>> +    spin_unlock(&ci->i_ceph_lock);
>> +
>> +    queue_work(ceph_inode_to_client(inode)->inode_wq,
>> +           &ci->put_cap_refs_work);
>> +}
>>   /*
>>    * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
>>    * context.  Adjust per-snap dirty page accounting as appropriate.
>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>> index 357c937..303276a 100644
>> --- a/fs/ceph/inode.c
>> +++ b/fs/ceph/inode.c
>> @@ -517,6 +517,9 @@ struct inode *ceph_alloc_inode(struct super_block 
>> *sb)
>>       INIT_LIST_HEAD(&ci->i_snap_realm_item);
>>       INIT_LIST_HEAD(&ci->i_snap_flush_item);
>>   +    INIT_WORK(&ci->put_cap_refs_work, ceph_async_put_cap_refs_work);
>> +    ci->pending_put_caps = 0;
>> +
>>       INIT_WORK(&ci->i_work, ceph_inode_work);
>>       ci->i_work_mask = 0;
>>       memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 0e0ab01..40b31da 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -811,12 +811,14 @@ void ceph_mdsc_release_request(struct kref *kref)
>>       if (req->r_reply)
>>           ceph_msg_put(req->r_reply);
>>       if (req->r_inode) {
>> -        ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
>> +        ceph_async_put_cap_refs(ceph_inode(req->r_inode),
>> +                    CEPH_CAP_PIN);
>>           /* avoid calling iput_final() in mds dispatch threads */
>>           ceph_async_iput(req->r_inode);
>>       }
>>       if (req->r_parent) {
>> -        ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
>> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent),
>> +                    CEPH_CAP_PIN);
>>           ceph_async_iput(req->r_parent);
>>       }
>>       ceph_async_iput(req->r_target_inode);
>> @@ -831,8 +833,8 @@ void ceph_mdsc_release_request(struct kref *kref)
>>            * changed between the dir mutex being dropped and
>>            * this request being freed.
>>            */
>> -        ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
>> -                  CEPH_CAP_PIN);
>> + ceph_async_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
>> +                    CEPH_CAP_PIN);
>>           ceph_async_iput(req->r_old_dentry_dir);
>>       }
>>       kfree(req->r_path1);
>> @@ -3398,7 +3400,7 @@ void ceph_mdsc_release_dir_caps(struct 
>> ceph_mds_request *req)
>>       dcaps = xchg(&req->r_dir_caps, 0);
>>       if (dcaps) {
>>           dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
>> -        ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
>> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent), dcaps);
>>       }
>>   }
>>   diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 226f19c..01d206f 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -421,6 +421,9 @@ struct ceph_inode_info {
>>       struct timespec64 i_btime;
>>       struct timespec64 i_snap_btime;
>>   +    struct work_struct put_cap_refs_work;
>> +    int pending_put_caps;
>> +
>>       struct work_struct i_work;
>>       unsigned long  i_work_mask;
>>   @@ -1095,6 +1098,8 @@ extern void ceph_take_cap_refs(struct 
>> ceph_inode_info *ci, int caps,
>>                   bool snap_rwsem_locked);
>>   extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
>>   extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
>> +extern void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int 
>> had);
>> +extern void ceph_async_put_cap_refs_work(struct work_struct *work);
>>   extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, 
>> int nr,
>>                          struct ceph_snap_context *snapc);
>>   extern void ceph_flush_snaps(struct ceph_inode_info *ci,
>>
>
Xiubo Li May 26, 2020, 3:52 a.m. UTC | #3
On 2020/5/26 11:11, Yan, Zheng wrote:
> On 5/25/20 7:16 PM, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> ceph_check_caps() may lock and unlock the session mutex.
>>
>> This can lead to deadlocks, for example:
>> handle_forward()
>> ...
>> mutex_lock(&mdsc->mutex)
>> ...
>> ceph_mdsc_put_request()
>>    --> ceph_mdsc_release_request()
>>      --> ceph_put_cap_request()
>>        --> ceph_put_cap_refs()
>>          --> ceph_check_caps()
>> ...
>> mutex_unlock(&mdsc->mutex)
>>
>> It can also lead to double session locking, for example:
>>
>> send_mds_reconnect()
>> ...
>> mutex_lock(&session->s_mutex);
>> ...
>>    --> replay_unsafe_requests()
>>      --> ceph_mdsc_release_dir_caps()
>>        --> ceph_put_cap_refs()
>>          --> ceph_check_caps()
>> ...
>> mutex_unlock(&session->s_mutex);
>>
>> URL: https://tracker.ceph.com/issues/45635
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/caps.c       | 29 +++++++++++++++++++++++++++++
>>   fs/ceph/inode.c      |  3 +++
>>   fs/ceph/mds_client.c | 12 +++++++-----
>>   fs/ceph/super.h      |  5 +++++
>>   4 files changed, 44 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>> index 27c2e60..aea66c1 100644
>> --- a/fs/ceph/caps.c
>> +++ b/fs/ceph/caps.c
>> @@ -3082,6 +3082,35 @@ void ceph_put_cap_refs(struct ceph_inode_info 
>> *ci, int had)
>>           iput(inode);
>>   }
>>   +void ceph_async_put_cap_refs_work(struct work_struct *work)
>> +{
>> +    struct ceph_inode_info *ci = container_of(work, struct 
>> ceph_inode_info,
>> +                          put_cap_refs_work);
>> +    int caps;
>> +
>> +    spin_lock(&ci->i_ceph_lock);
>> +    caps = xchg(&ci->pending_put_caps, 0);
>> +    spin_unlock(&ci->i_ceph_lock);
>> +
>> +    ceph_put_cap_refs(ci, caps);
>> +}
>> +
>> +void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had)
>> +{
>> +    struct inode *inode = &ci->vfs_inode;
>> +
>> +    spin_lock(&ci->i_ceph_lock);
>> +    if (ci->pending_put_caps & had) {
>> +            spin_unlock(&ci->i_ceph_lock);
>> +        return;
>> +    }
>
> This will cause a cap ref leak.
>
> I thought about this again. All the trouble is caused by calling 
> ceph_mdsc_release_dir_caps() inside ceph_mdsc_release_request().
> We already call ceph_mdsc_release_dir_caps() in ceph_async_foo_cb() 
> for the normal circumstance. We just need to call 
> ceph_mdsc_release_dir_caps() in the 'session closed' case (we never 
> abort async requests). In the 'session closed' case, we can use 
> ceph_put_cap_refs_no_check_caps().
>
IMO, this ceph_async_put_cap_refs() helper is still needed, since there
are many other cases like:

cleanup_session_requests()
{
...
    --> mutex_lock(&mdsc->mutex)
...
    --> __unregister_request()
        --> ceph_mdsc_put_request()
            --> ceph_mdsc_release_request()
                --> ceph_put_cap_request()
                    --> ceph_put_cap_refs()
                        --> ceph_check_caps()  ===> will do the session->s_mutex lock/unlock
...
    --> mutex_unlock(&mdsc->mutex)
}
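
As an illustration of how the deferral in this patch breaks such chains,
here is a userspace pthreads analogue (not ceph code: pthread_create()
stands in for queue_work(), and the worker for
ceph_async_put_cap_refs_work(), which runs with no MDS locks held):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mdsc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t s_mutex    = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for ceph_async_put_cap_refs_work(): by the time it runs,
 * the dispatch thread's locks impose no ordering on it */
static void *put_cap_refs_work(void *arg)
{
	pthread_mutex_lock(&s_mutex);
	puts("worker: check_caps ran without mdsc_mutex held");
	pthread_mutex_unlock(&s_mutex);
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_mutex_lock(&mdsc_mutex);	/* cleanup_session_requests() */
	/* stand-in for queue_work(): defer the put instead of inlining it */
	pthread_create(&worker, NULL, put_cap_refs_work, NULL);
	pthread_mutex_unlock(&mdsc_mutex);

	pthread_join(worker, NULL);
	return 0;
}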


> Regards
> Yan, Zheng
>
>> +
>> +    ci->pending_put_caps |= had;
>> +    spin_unlock(&ci->i_ceph_lock);
>> +
>> +    queue_work(ceph_inode_to_client(inode)->inode_wq,
>> +           &ci->put_cap_refs_work);
>> +}
>>   /*
>>    * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
>>    * context.  Adjust per-snap dirty page accounting as appropriate.
>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>> index 357c937..303276a 100644
>> --- a/fs/ceph/inode.c
>> +++ b/fs/ceph/inode.c
>> @@ -517,6 +517,9 @@ struct inode *ceph_alloc_inode(struct super_block 
>> *sb)
>>       INIT_LIST_HEAD(&ci->i_snap_realm_item);
>>       INIT_LIST_HEAD(&ci->i_snap_flush_item);
>>   +    INIT_WORK(&ci->put_cap_refs_work, ceph_async_put_cap_refs_work);
>> +    ci->pending_put_caps = 0;
>> +
>>       INIT_WORK(&ci->i_work, ceph_inode_work);
>>       ci->i_work_mask = 0;
>>       memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 0e0ab01..40b31da 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -811,12 +811,14 @@ void ceph_mdsc_release_request(struct kref *kref)
>>       if (req->r_reply)
>>           ceph_msg_put(req->r_reply);
>>       if (req->r_inode) {
>> -        ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
>> +        ceph_async_put_cap_refs(ceph_inode(req->r_inode),
>> +                    CEPH_CAP_PIN);
>>           /* avoid calling iput_final() in mds dispatch threads */
>>           ceph_async_iput(req->r_inode);
>>       }
>>       if (req->r_parent) {
>> -        ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
>> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent),
>> +                    CEPH_CAP_PIN);
>>           ceph_async_iput(req->r_parent);
>>       }
>>       ceph_async_iput(req->r_target_inode);
>> @@ -831,8 +833,8 @@ void ceph_mdsc_release_request(struct kref *kref)
>>            * changed between the dir mutex being dropped and
>>            * this request being freed.
>>            */
>> -        ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
>> -                  CEPH_CAP_PIN);
>> + ceph_async_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
>> +                    CEPH_CAP_PIN);
>>           ceph_async_iput(req->r_old_dentry_dir);
>>       }
>>       kfree(req->r_path1);
>> @@ -3398,7 +3400,7 @@ void ceph_mdsc_release_dir_caps(struct 
>> ceph_mds_request *req)
>>       dcaps = xchg(&req->r_dir_caps, 0);
>>       if (dcaps) {
>>           dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
>> -        ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
>> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent), dcaps);
>>       }
>>   }
>>   diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 226f19c..01d206f 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -421,6 +421,9 @@ struct ceph_inode_info {
>>       struct timespec64 i_btime;
>>       struct timespec64 i_snap_btime;
>>   +    struct work_struct put_cap_refs_work;
>> +    int pending_put_caps;
>> +
>>       struct work_struct i_work;
>>       unsigned long  i_work_mask;
>>   @@ -1095,6 +1098,8 @@ extern void ceph_take_cap_refs(struct 
>> ceph_inode_info *ci, int caps,
>>                   bool snap_rwsem_locked);
>>   extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
>>   extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
>> +extern void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int 
>> had);
>> +extern void ceph_async_put_cap_refs_work(struct work_struct *work);
>>   extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, 
>> int nr,
>>                          struct ceph_snap_context *snapc);
>>   extern void ceph_flush_snaps(struct ceph_inode_info *ci,
>>
>
Yan, Zheng May 26, 2020, 6:29 a.m. UTC | #4
On Tue, May 26, 2020 at 11:42 AM Xiubo Li <xiubli@redhat.com> wrote:
>
> On 2020/5/26 11:11, Yan, Zheng wrote:
> > On 5/25/20 7:16 PM, xiubli@redhat.com wrote:
> >> From: Xiubo Li <xiubli@redhat.com>
> >>
> >> ceph_check_caps() may lock and unlock the session mutex.
> >>
> >> This can lead to deadlocks, for example:
> >> handle_forward()
> >> ...
> >> mutex_lock(&mdsc->mutex)
> >> ...
> >> ceph_mdsc_put_request()
> >>    --> ceph_mdsc_release_request()
> >>      --> ceph_put_cap_request()
> >>        --> ceph_put_cap_refs()
> >>          --> ceph_check_caps()
> >> ...
> >> mutex_unlock(&mdsc->mutex)
> >>
> >> It can also lead to double session locking, for example:
> >>
> >> send_mds_reconnect()
> >> ...
> >> mutex_lock(&session->s_mutex);
> >> ...
> >>    --> replay_unsafe_requests()
> >>      --> ceph_mdsc_release_dir_caps()
> >>        --> ceph_put_cap_refs()
> >>          --> ceph_check_caps()
> >> ...
> >> mutex_unlock(&session->s_mutex);
> >>
> >> URL: https://tracker.ceph.com/issues/45635
> >> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >> ---
> >>   fs/ceph/caps.c       | 29 +++++++++++++++++++++++++++++
> >>   fs/ceph/inode.c      |  3 +++
> >>   fs/ceph/mds_client.c | 12 +++++++-----
> >>   fs/ceph/super.h      |  5 +++++
> >>   4 files changed, 44 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> >> index 27c2e60..aea66c1 100644
> >> --- a/fs/ceph/caps.c
> >> +++ b/fs/ceph/caps.c
> >> @@ -3082,6 +3082,35 @@ void ceph_put_cap_refs(struct ceph_inode_info
> >> *ci, int had)
> >>           iput(inode);
> >>   }
> >>   +void ceph_async_put_cap_refs_work(struct work_struct *work)
> >> +{
> >> +    struct ceph_inode_info *ci = container_of(work, struct
> >> ceph_inode_info,
> >> +                          put_cap_refs_work);
> >> +    int caps;
> >> +
> >> +    spin_lock(&ci->i_ceph_lock);
> >> +    caps = xchg(&ci->pending_put_caps, 0);
> >> +    spin_unlock(&ci->i_ceph_lock);
> >> +
> >> +    ceph_put_cap_refs(ci, caps);
> >> +}
> >> +
> >> +void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had)
> >> +{
> >> +    struct inode *inode = &ci->vfs_inode;
> >> +
> >> +    spin_lock(&ci->i_ceph_lock);
> >> +    if (ci->pending_put_caps & had) {
> >> +            spin_unlock(&ci->i_ceph_lock);
> >> +        return;
> >> +    }
> >
> > This will cause a cap ref leak.
>
> Ah, yeah, right.
>
>
> >
> > I thought about this again. All the trouble is caused by calling
> > ceph_mdsc_release_dir_caps() inside ceph_mdsc_release_request().
>
> Also, ceph_mdsc_release_request() calls ceph_put_cap_refs() directly
> in three other places.
>

Putting CEPH_CAP_PIN does not trigger check_caps(), so only
ceph_mdsc_release_dir_caps() matters.
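
A simplified sketch of why that holds (abridged, not a verbatim copy of
fs/ceph/caps.c): dropping CEPH_CAP_PIN only decrements i_pin_ref and
never contributes to 'last', and ceph_check_caps() only runs when 'last'
is set:

/* Abridged sketch of ceph_put_cap_refs(); the field names are real,
 * but the body is reduced to the point being made. */
void ceph_put_cap_refs_sketch(struct ceph_inode_info *ci, int had)
{
	int last = 0;

	spin_lock(&ci->i_ceph_lock);
	if (had & CEPH_CAP_PIN)
		--ci->i_pin_ref;		/* never touches 'last' */
	if (had & CEPH_CAP_FILE_RD)
		if (--ci->i_rd_ref == 0)
			last++;			/* these puts may reach check_caps */
	/* ... same pattern for the CACHE/WR/BUFFER refs ... */
	spin_unlock(&ci->i_ceph_lock);

	if (last)
		ceph_check_caps(ci, 0, NULL);	/* unreachable for pure PIN puts */
}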

> BRs
>
> Xiubo
>
> > We already call ceph_mdsc_release_dir_caps() in ceph_async_foo_cb()
> > for the normal circumstance. We just need to call
> > ceph_mdsc_release_dir_caps() in the 'session closed' case (we never
> > abort async requests). In the 'session closed' case, we can use
> > ceph_put_cap_refs_no_check_caps().
> >
> > Regards
> > Yan, Zheng
> >
> >> +
> >> +    ci->pending_put_caps |= had;
> >> +    spin_unlock(&ci->i_ceph_lock);
> >> +
> >> +    queue_work(ceph_inode_to_client(inode)->inode_wq,
> >> +           &ci->put_cap_refs_work);
> >> +}
> >>   /*
> >>    * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
> >>    * context.  Adjust per-snap dirty page accounting as appropriate.
> >> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> >> index 357c937..303276a 100644
> >> --- a/fs/ceph/inode.c
> >> +++ b/fs/ceph/inode.c
> >> @@ -517,6 +517,9 @@ struct inode *ceph_alloc_inode(struct super_block
> >> *sb)
> >>       INIT_LIST_HEAD(&ci->i_snap_realm_item);
> >>       INIT_LIST_HEAD(&ci->i_snap_flush_item);
> >>   +    INIT_WORK(&ci->put_cap_refs_work, ceph_async_put_cap_refs_work);
> >> +    ci->pending_put_caps = 0;
> >> +
> >>       INIT_WORK(&ci->i_work, ceph_inode_work);
> >>       ci->i_work_mask = 0;
> >>       memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >> index 0e0ab01..40b31da 100644
> >> --- a/fs/ceph/mds_client.c
> >> +++ b/fs/ceph/mds_client.c
> >> @@ -811,12 +811,14 @@ void ceph_mdsc_release_request(struct kref *kref)
> >>       if (req->r_reply)
> >>           ceph_msg_put(req->r_reply);
> >>       if (req->r_inode) {
> >> -        ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
> >> +        ceph_async_put_cap_refs(ceph_inode(req->r_inode),
> >> +                    CEPH_CAP_PIN);
> >>           /* avoid calling iput_final() in mds dispatch threads */
> >>           ceph_async_iput(req->r_inode);
> >>       }
> >>       if (req->r_parent) {
> >> -        ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
> >> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent),
> >> +                    CEPH_CAP_PIN);
> >>           ceph_async_iput(req->r_parent);
> >>       }
> >>       ceph_async_iput(req->r_target_inode);
> >> @@ -831,8 +833,8 @@ void ceph_mdsc_release_request(struct kref *kref)
> >>            * changed between the dir mutex being dropped and
> >>            * this request being freed.
> >>            */
> >> -        ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
> >> -                  CEPH_CAP_PIN);
> >> + ceph_async_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
> >> +                    CEPH_CAP_PIN);
> >>           ceph_async_iput(req->r_old_dentry_dir);
> >>       }
> >>       kfree(req->r_path1);
> >> @@ -3398,7 +3400,7 @@ void ceph_mdsc_release_dir_caps(struct
> >> ceph_mds_request *req)
> >>       dcaps = xchg(&req->r_dir_caps, 0);
> >>       if (dcaps) {
> >>           dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
> >> -        ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
> >> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent), dcaps);
> >>       }
> >>   }
> >>   diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> >> index 226f19c..01d206f 100644
> >> --- a/fs/ceph/super.h
> >> +++ b/fs/ceph/super.h
> >> @@ -421,6 +421,9 @@ struct ceph_inode_info {
> >>       struct timespec64 i_btime;
> >>       struct timespec64 i_snap_btime;
> >>   +    struct work_struct put_cap_refs_work;
> >> +    int pending_put_caps;
> >> +
> >>       struct work_struct i_work;
> >>       unsigned long  i_work_mask;
> >>   @@ -1095,6 +1098,8 @@ extern void ceph_take_cap_refs(struct
> >> ceph_inode_info *ci, int caps,
> >>                   bool snap_rwsem_locked);
> >>   extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
> >>   extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
> >> +extern void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int
> >> had);
> >> +extern void ceph_async_put_cap_refs_work(struct work_struct *work);
> >>   extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci,
> >> int nr,
> >>                          struct ceph_snap_context *snapc);
> >>   extern void ceph_flush_snaps(struct ceph_inode_info *ci,
> >>
> >
>
Xiubo Li May 26, 2020, 7:30 a.m. UTC | #5
On 2020/5/26 14:29, Yan, Zheng wrote:
> On Tue, May 26, 2020 at 11:42 AM Xiubo Li <xiubli@redhat.com> wrote:
>> On 2020/5/26 11:11, Yan, Zheng wrote:
>>> On 5/25/20 7:16 PM, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> ceph_check_caps() may lock and unlock the session mutex.
>>>>
>>>> This can lead to deadlocks, for example:
>>>> handle_forward()
>>>> ...
>>>> mutex_lock(&mdsc->mutex)
>>>> ...
>>>> ceph_mdsc_put_request()
>>>>     --> ceph_mdsc_release_request()
>>>>       --> ceph_put_cap_request()
>>>>         --> ceph_put_cap_refs()
>>>>           --> ceph_check_caps()
>>>> ...
>>>> mutex_unlock(&mdsc->mutex)
>>>>
>>>> It can also lead to double session locking, for example:
>>>>
>>>> send_mds_reconnect()
>>>> ...
>>>> mutex_lock(&session->s_mutex);
>>>> ...
>>>>     --> replay_unsafe_requests()
>>>>       --> ceph_mdsc_release_dir_caps()
>>>>         --> ceph_put_cap_refs()
>>>>           --> ceph_check_caps()
>>>> ...
>>>> mutex_unlock(&session->s_mutex);
>>>>
>>>> URL: https://tracker.ceph.com/issues/45635
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/caps.c       | 29 +++++++++++++++++++++++++++++
>>>>    fs/ceph/inode.c      |  3 +++
>>>>    fs/ceph/mds_client.c | 12 +++++++-----
>>>>    fs/ceph/super.h      |  5 +++++
>>>>    4 files changed, 44 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
>>>> index 27c2e60..aea66c1 100644
>>>> --- a/fs/ceph/caps.c
>>>> +++ b/fs/ceph/caps.c
>>>> @@ -3082,6 +3082,35 @@ void ceph_put_cap_refs(struct ceph_inode_info
>>>> *ci, int had)
>>>>            iput(inode);
>>>>    }
>>>>    +void ceph_async_put_cap_refs_work(struct work_struct *work)
>>>> +{
>>>> +    struct ceph_inode_info *ci = container_of(work, struct
>>>> ceph_inode_info,
>>>> +                          put_cap_refs_work);
>>>> +    int caps;
>>>> +
>>>> +    spin_lock(&ci->i_ceph_lock);
>>>> +    caps = xchg(&ci->pending_put_caps, 0);
>>>> +    spin_unlock(&ci->i_ceph_lock);
>>>> +
>>>> +    ceph_put_cap_refs(ci, caps);
>>>> +}
>>>> +
>>>> +void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had)
>>>> +{
>>>> +    struct inode *inode = &ci->vfs_inode;
>>>> +
>>>> +    spin_lock(&ci->i_ceph_lock);
>>>> +    if (ci->pending_put_caps & had) {
>>>> +            spin_unlock(&ci->i_ceph_lock);
>>>> +        return;
>>>> +    }
>>> This will cause a cap ref leak.
>> Ah, yeah, right.
>>
>>
>>> I thought about this again. All the trouble is caused by calling
>>> ceph_mdsc_release_dir_caps() inside ceph_mdsc_release_request().
>> Also, ceph_mdsc_release_request() calls ceph_put_cap_refs() directly
>> in three other places.
>>
> Putting CEPH_CAP_PIN does not trigger check_caps(), so only
> ceph_mdsc_release_dir_caps() matters.

Yeah. Right. I will fix this.

Thanks

BRs

>> BRs
>>
>> Xiubo
>>
>>> We already call ceph_mdsc_release_dir_caps() in ceph_async_foo_cb()
>>> for the normal circumstance. We just need to call
>>> ceph_mdsc_release_dir_caps() in the 'session closed' case (we never
>>> abort async requests). In the 'session closed' case, we can use
>>> ceph_put_cap_refs_no_check_caps().
>>>
>>> Regards
>>> Yan, Zheng
>>>
>>>> +
>>>> +    ci->pending_put_caps |= had;
>>>> +    spin_unlock(&ci->i_ceph_lock);
>>>> +
>>>> +    queue_work(ceph_inode_to_client(inode)->inode_wq,
>>>> +           &ci->put_cap_refs_work);
>>>> +}
>>>>    /*
>>>>     * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
>>>>     * context.  Adjust per-snap dirty page accounting as appropriate.
>>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>>> index 357c937..303276a 100644
>>>> --- a/fs/ceph/inode.c
>>>> +++ b/fs/ceph/inode.c
>>>> @@ -517,6 +517,9 @@ struct inode *ceph_alloc_inode(struct super_block
>>>> *sb)
>>>>        INIT_LIST_HEAD(&ci->i_snap_realm_item);
>>>>        INIT_LIST_HEAD(&ci->i_snap_flush_item);
>>>>    +    INIT_WORK(&ci->put_cap_refs_work, ceph_async_put_cap_refs_work);
>>>> +    ci->pending_put_caps = 0;
>>>> +
>>>>        INIT_WORK(&ci->i_work, ceph_inode_work);
>>>>        ci->i_work_mask = 0;
>>>>        memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index 0e0ab01..40b31da 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -811,12 +811,14 @@ void ceph_mdsc_release_request(struct kref *kref)
>>>>        if (req->r_reply)
>>>>            ceph_msg_put(req->r_reply);
>>>>        if (req->r_inode) {
>>>> -        ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
>>>> +        ceph_async_put_cap_refs(ceph_inode(req->r_inode),
>>>> +                    CEPH_CAP_PIN);
>>>>            /* avoid calling iput_final() in mds dispatch threads */
>>>>            ceph_async_iput(req->r_inode);
>>>>        }
>>>>        if (req->r_parent) {
>>>> -        ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
>>>> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent),
>>>> +                    CEPH_CAP_PIN);
>>>>            ceph_async_iput(req->r_parent);
>>>>        }
>>>>        ceph_async_iput(req->r_target_inode);
>>>> @@ -831,8 +833,8 @@ void ceph_mdsc_release_request(struct kref *kref)
>>>>             * changed between the dir mutex being dropped and
>>>>             * this request being freed.
>>>>             */
>>>> -        ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
>>>> -                  CEPH_CAP_PIN);
>>>> + ceph_async_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
>>>> +                    CEPH_CAP_PIN);
>>>>            ceph_async_iput(req->r_old_dentry_dir);
>>>>        }
>>>>        kfree(req->r_path1);
>>>> @@ -3398,7 +3400,7 @@ void ceph_mdsc_release_dir_caps(struct
>>>> ceph_mds_request *req)
>>>>        dcaps = xchg(&req->r_dir_caps, 0);
>>>>        if (dcaps) {
>>>>            dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
>>>> -        ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
>>>> +        ceph_async_put_cap_refs(ceph_inode(req->r_parent), dcaps);
>>>>        }
>>>>    }
>>>>    diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>>> index 226f19c..01d206f 100644
>>>> --- a/fs/ceph/super.h
>>>> +++ b/fs/ceph/super.h
>>>> @@ -421,6 +421,9 @@ struct ceph_inode_info {
>>>>        struct timespec64 i_btime;
>>>>        struct timespec64 i_snap_btime;
>>>>    +    struct work_struct put_cap_refs_work;
>>>> +    int pending_put_caps;
>>>> +
>>>>        struct work_struct i_work;
>>>>        unsigned long  i_work_mask;
>>>>    @@ -1095,6 +1098,8 @@ extern void ceph_take_cap_refs(struct
>>>> ceph_inode_info *ci, int caps,
>>>>                    bool snap_rwsem_locked);
>>>>    extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
>>>>    extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
>>>> +extern void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int
>>>> had);
>>>> +extern void ceph_async_put_cap_refs_work(struct work_struct *work);
>>>>    extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci,
>>>> int nr,
>>>>                           struct ceph_snap_context *snapc);
>>>>    extern void ceph_flush_snaps(struct ceph_inode_info *ci,
>>>>

Patch

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 27c2e60..aea66c1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3082,6 +3082,35 @@  void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 		iput(inode);
 }
 
+void ceph_async_put_cap_refs_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  put_cap_refs_work);
+	int caps;
+
+	spin_lock(&ci->i_ceph_lock);
+	caps = xchg(&ci->pending_put_caps, 0);
+	spin_unlock(&ci->i_ceph_lock);
+
+	ceph_put_cap_refs(ci, caps);
+}
+
+void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+	struct inode *inode = &ci->vfs_inode;
+
+	spin_lock(&ci->i_ceph_lock);
+	if (ci->pending_put_caps & had) {
+	        spin_unlock(&ci->i_ceph_lock);
+		return;
+	}
+
+	ci->pending_put_caps |= had;
+	spin_unlock(&ci->i_ceph_lock);
+
+	queue_work(ceph_inode_to_client(inode)->inode_wq,
+		   &ci->put_cap_refs_work);
+}
 /*
  * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
  * context.  Adjust per-snap dirty page accounting as appropriate.
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937..303276a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -517,6 +517,9 @@  struct inode *ceph_alloc_inode(struct super_block *sb)
 	INIT_LIST_HEAD(&ci->i_snap_realm_item);
 	INIT_LIST_HEAD(&ci->i_snap_flush_item);
 
+	INIT_WORK(&ci->put_cap_refs_work, ceph_async_put_cap_refs_work);
+	ci->pending_put_caps = 0;
+
 	INIT_WORK(&ci->i_work, ceph_inode_work);
 	ci->i_work_mask = 0;
 	memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0e0ab01..40b31da 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -811,12 +811,14 @@  void ceph_mdsc_release_request(struct kref *kref)
 	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
 	if (req->r_inode) {
-		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+		ceph_async_put_cap_refs(ceph_inode(req->r_inode),
+					CEPH_CAP_PIN);
 		/* avoid calling iput_final() in mds dispatch threads */
 		ceph_async_iput(req->r_inode);
 	}
 	if (req->r_parent) {
-		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+		ceph_async_put_cap_refs(ceph_inode(req->r_parent),
+					CEPH_CAP_PIN);
 		ceph_async_iput(req->r_parent);
 	}
 	ceph_async_iput(req->r_target_inode);
@@ -831,8 +833,8 @@  void ceph_mdsc_release_request(struct kref *kref)
 		 * changed between the dir mutex being dropped and
 		 * this request being freed.
 		 */
-		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
-				  CEPH_CAP_PIN);
+		ceph_async_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
+					CEPH_CAP_PIN);
 		ceph_async_iput(req->r_old_dentry_dir);
 	}
 	kfree(req->r_path1);
@@ -3398,7 +3400,7 @@  void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
 	dcaps = xchg(&req->r_dir_caps, 0);
 	if (dcaps) {
 		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
-		ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
+		ceph_async_put_cap_refs(ceph_inode(req->r_parent), dcaps);
 	}
 }
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 226f19c..01d206f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -421,6 +421,9 @@  struct ceph_inode_info {
 	struct timespec64 i_btime;
 	struct timespec64 i_snap_btime;
 
+	struct work_struct put_cap_refs_work;
+	int pending_put_caps;
+
 	struct work_struct i_work;
 	unsigned long  i_work_mask;
 
@@ -1095,6 +1098,8 @@  extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
 				bool snap_rwsem_locked);
 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_async_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_async_put_cap_refs_work(struct work_struct *work);
 extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				       struct ceph_snap_context *snapc);
 extern void ceph_flush_snaps(struct ceph_inode_info *ci,