[v18,67/71] libceph: defer removing the req from osdc just after req->r_callback

Message ID 20230412110930.176835-68-xiubli@redhat.com (mailing list archive)
State New, archived
Series ceph+fscrypt: full support

Commit Message

Xiubo Li April 12, 2023, 11:09 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

sync_filesystem() will flush all the dirty buffers, submit the osd
reqs to the osdc and then block waiting for all the reqs to finish.
But when the reqs' replies come, the reqs are removed from the osdc
just before the req->r_callback()s are called, which means
sync_filesystem() can be woken up while the req->r_callback()s are
still running.

This is buggy when the waiter requires the req->r_callback()s to
release some resources before continuing. So we need to make sure the
req->r_callback()s are called before removing the reqs from the osdc.
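
The problematic interleaving, roughly (an illustrative sketch; the
waiter side is simplified, function names as in net/ceph/osd_client.c):

  /*
   *  waiter (e.g. sync_filesystem)     reply path
   *  -----------------------------     ----------
   *  submit osd reqs
   *  block until all reqs finish
   *                                    finish_request()
   *                                      unlink_request()   <- req removed
   *                                      num_requests--        from the osdc
   *  wakes up, proceeds (e.g. with
   *  umount / fscrypt keyring teardown)
   *                                    __complete_request()
   *                                      req->r_callback(req)  <- still
   *                                                               running
   */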

WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0
CPU: 4 PID: 168846 Comm: umount Tainted: G S  6.1.0-rc5-ceph-g72ead199864c #1
Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015
RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0
RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202
RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00
RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000
RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000
R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40
R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000
FS:  00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
generic_shutdown_super+0x47/0x120
kill_anon_super+0x14/0x30
ceph_kill_sb+0x36/0x90 [ceph]
deactivate_locked_super+0x29/0x60
cleanup_mnt+0xb8/0x140
task_work_run+0x67/0xb0
exit_to_user_mode_prepare+0x23d/0x240
syscall_exit_to_user_mode+0x25/0x60
do_syscall_64+0x40/0x80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fd83dc39e9b

URL: https://tracker.ceph.com/issues/58126
Tested-by: Luís Henriques <lhenriques@suse.de>
Tested-by: Venky Shankar <vshankar@redhat.com>
Reviewed-by: Luís Henriques <lhenriques@suse.de>
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 net/ceph/osd_client.c | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

Comments

Ilya Dryomov April 16, 2023, 7:49 p.m. UTC | #1
On Wed, Apr 12, 2023 at 1:15 PM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> sync_filesystem() will flush all the dirty buffers, submit the osd
> reqs to the osdc and then block waiting for all the reqs to finish.
> But when the reqs' replies come, the reqs are removed from the osdc
> just before the req->r_callback()s are called, which means
> sync_filesystem() can be woken up while the req->r_callback()s are
> still running.
>
> This is buggy when the waiter requires the req->r_callback()s to
> release some resources before continuing. So we need to make sure the
> req->r_callback()s are called before removing the reqs from the osdc.
>
> WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0
> CPU: 4 PID: 168846 Comm: umount Tainted: G S  6.1.0-rc5-ceph-g72ead199864c #1
> Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015
> RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0
> RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202
> RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00
> RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000
> RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000
> R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40
> R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000
> FS:  00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> <TASK>
> generic_shutdown_super+0x47/0x120
> kill_anon_super+0x14/0x30
> ceph_kill_sb+0x36/0x90 [ceph]
> deactivate_locked_super+0x29/0x60
> cleanup_mnt+0xb8/0x140
> task_work_run+0x67/0xb0
> exit_to_user_mode_prepare+0x23d/0x240
> syscall_exit_to_user_mode+0x25/0x60
> do_syscall_64+0x40/0x80
> entry_SYSCALL_64_after_hwframe+0x63/0xcd
> RIP: 0033:0x7fd83dc39e9b
>
> URL: https://tracker.ceph.com/issues/58126
> Tested-by: Luís Henriques <lhenriques@suse.de>
> Tested-by: Venky Shankar <vshankar@redhat.com>
> Reviewed-by: Luís Henriques <lhenriques@suse.de>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  net/ceph/osd_client.c | 43 +++++++++++++++++++++++++++++++++++--------
>  1 file changed, 35 insertions(+), 8 deletions(-)
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 78b622178a3d..db3d93d3e692 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -2507,7 +2507,7 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked)
>         __submit_request(req, wrlocked);
>  }
>
> -static void finish_request(struct ceph_osd_request *req)
> +static void __finish_request(struct ceph_osd_request *req)
>  {
>         struct ceph_osd_client *osdc = req->r_osdc;
>
> @@ -2516,12 +2516,6 @@ static void finish_request(struct ceph_osd_request *req)
>
>         req->r_end_latency = ktime_get();
>
> -       if (req->r_osd) {
> -               ceph_init_sparse_read(&req->r_osd->o_sparse_read);
> -               unlink_request(req->r_osd, req);
> -       }
> -       atomic_dec(&osdc->num_requests);
> -
>         /*
>          * If an OSD has failed or returned and a request has been sent
>          * twice, it's possible to get a reply and end up here while the
> @@ -2532,13 +2526,46 @@ static void finish_request(struct ceph_osd_request *req)
>         ceph_msg_revoke_incoming(req->r_reply);
>  }
>
> +static void __remove_request(struct ceph_osd_request *req)
> +{
> +       struct ceph_osd_client *osdc = req->r_osdc;
> +
> +       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
> +
> +       if (req->r_osd) {
> +               ceph_init_sparse_read(&req->r_osd->o_sparse_read);
> +               unlink_request(req->r_osd, req);
> +       }
> +       atomic_dec(&osdc->num_requests);
> +}
> +
> +static void finish_request(struct ceph_osd_request *req)
> +{
> +       __finish_request(req);
> +       __remove_request(req);
> +}
> +
>  static void __complete_request(struct ceph_osd_request *req)
>  {
> +       struct ceph_osd_client *osdc = req->r_osdc;
> +       struct ceph_osd *osd = req->r_osd;
> +
>         dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
>              req->r_tid, req->r_callback, req->r_result);
>
>         if (req->r_callback)
>                 req->r_callback(req);
> +
> +       down_read(&osdc->lock);
> +       if (osd) {
> +               mutex_lock(&osd->lock);
> +               __remove_request(req);
> +               mutex_unlock(&osd->lock);
> +       } else {
> +               atomic_dec(&osdc->num_requests);
> +       }
> +       up_read(&osdc->lock);

Hi Xiubo,

I think it was highlighted before that this patch is wrong so I'm
surprised to see it in this series, broken out and with a different
subject.

On top of changing long-standing behavior which is purposefully
consistent with Objecter behavior in userspace, it also seems to
introduce a race condition with ceph_osdc_cancel_request() which can
lead, among other things, to a use-after-free on req.  Consider what
would happen if the request is cancelled while the callback is running:
since it would still be linked at that point, cancel_request() would be
allowed to go through and eventually put the messenger's reference.
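
To illustrate, one possible interleaving with this patch applied
(a sketch of the scenario described above, not a reproduced trace):

  /*
   *  completion path                   cancelling thread
   *  ---------------                   -----------------
   *  __complete_request(req)
   *    req->r_callback(req)
   *      ... still running ...         ceph_osdc_cancel_request(req)
   *                                      req is still linked, so the
   *                                      cancel goes through and ends
   *                                      up putting the messenger's
   *                                      reference -> req can be freed
   *    __remove_request(req)           <- use-after-free
   */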

Thanks,

                Ilya
Xiubo Li April 17, 2023, 1:08 a.m. UTC | #2
On 4/17/23 03:49, Ilya Dryomov wrote:
> On Wed, Apr 12, 2023 at 1:15 PM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> sync_filesystem() will flush all the dirty buffers, submit the osd
>> reqs to the osdc and then block waiting for all the reqs to finish.
>> But when the reqs' replies come, the reqs are removed from the osdc
>> just before the req->r_callback()s are called, which means
>> sync_filesystem() can be woken up while the req->r_callback()s are
>> still running.
>>
>> This is buggy when the waiter requires the req->r_callback()s to
>> release some resources before continuing. So we need to make sure the
>> req->r_callback()s are called before removing the reqs from the osdc.
>>
>> WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0
>> CPU: 4 PID: 168846 Comm: umount Tainted: G S  6.1.0-rc5-ceph-g72ead199864c #1
>> Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015
>> RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0
>> RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202
>> RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00
>> RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000
>> RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000
>> R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40
>> R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000
>> FS:  00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0
>> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> Call Trace:
>> <TASK>
>> generic_shutdown_super+0x47/0x120
>> kill_anon_super+0x14/0x30
>> ceph_kill_sb+0x36/0x90 [ceph]
>> deactivate_locked_super+0x29/0x60
>> cleanup_mnt+0xb8/0x140
>> task_work_run+0x67/0xb0
>> exit_to_user_mode_prepare+0x23d/0x240
>> syscall_exit_to_user_mode+0x25/0x60
>> do_syscall_64+0x40/0x80
>> entry_SYSCALL_64_after_hwframe+0x63/0xcd
>> RIP: 0033:0x7fd83dc39e9b
>>
>> URL: https://tracker.ceph.com/issues/58126
>> Tested-by: Luís Henriques <lhenriques@suse.de>
>> Tested-by: Venky Shankar <vshankar@redhat.com>
>> Reviewed-by: Luís Henriques <lhenriques@suse.de>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   net/ceph/osd_client.c | 43 +++++++++++++++++++++++++++++++++++--------
>>   1 file changed, 35 insertions(+), 8 deletions(-)
>>
>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>> index 78b622178a3d..db3d93d3e692 100644
>> --- a/net/ceph/osd_client.c
>> +++ b/net/ceph/osd_client.c
>> @@ -2507,7 +2507,7 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked)
>>          __submit_request(req, wrlocked);
>>   }
>>
>> -static void finish_request(struct ceph_osd_request *req)
>> +static void __finish_request(struct ceph_osd_request *req)
>>   {
>>          struct ceph_osd_client *osdc = req->r_osdc;
>>
>> @@ -2516,12 +2516,6 @@ static void finish_request(struct ceph_osd_request *req)
>>
>>          req->r_end_latency = ktime_get();
>>
>> -       if (req->r_osd) {
>> -               ceph_init_sparse_read(&req->r_osd->o_sparse_read);
>> -               unlink_request(req->r_osd, req);
>> -       }
>> -       atomic_dec(&osdc->num_requests);
>> -
>>          /*
>>           * If an OSD has failed or returned and a request has been sent
>>           * twice, it's possible to get a reply and end up here while the
>> @@ -2532,13 +2526,46 @@ static void finish_request(struct ceph_osd_request *req)
>>          ceph_msg_revoke_incoming(req->r_reply);
>>   }
>>
>> +static void __remove_request(struct ceph_osd_request *req)
>> +{
>> +       struct ceph_osd_client *osdc = req->r_osdc;
>> +
>> +       dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
>> +
>> +       if (req->r_osd) {
>> +               ceph_init_sparse_read(&req->r_osd->o_sparse_read);
>> +               unlink_request(req->r_osd, req);
>> +       }
>> +       atomic_dec(&osdc->num_requests);
>> +}
>> +
>> +static void finish_request(struct ceph_osd_request *req)
>> +{
>> +       __finish_request(req);
>> +       __remove_request(req);
>> +}
>> +
>>   static void __complete_request(struct ceph_osd_request *req)
>>   {
>> +       struct ceph_osd_client *osdc = req->r_osdc;
>> +       struct ceph_osd *osd = req->r_osd;
>> +
>>          dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
>>               req->r_tid, req->r_callback, req->r_result);
>>
>>          if (req->r_callback)
>>                  req->r_callback(req);
>> +
>> +       down_read(&osdc->lock);
>> +       if (osd) {
>> +               mutex_lock(&osd->lock);
>> +               __remove_request(req);
>> +               mutex_unlock(&osd->lock);
>> +       } else {
>> +               atomic_dec(&osdc->num_requests);
>> +       }
>> +       up_read(&osdc->lock);
> Hi Xiubo,
>
> I think it was highlighted before that this patch is wrong so I'm
> surprised to see it in this series, broken out and with a different
> subject.
>
> On top of changing long-standing behavior which is purposefully
> consistent with Objecter behavior in userspace, it also seems to
> introduce a race condition with ceph_osdc_cancel_request() which can
> lead, among other things, to a use-after-free on req.  Consider what
> would happen if the request is cancelled while the callback is running:
> since it would still be linked at that point, cancel_request() would be
> allowed to go through and eventually put the messenger's reference.

Okay, I may have missed or misunderstood it in another thread.

Since the bug has been fixed by the following [68/71] patch, let's 
remove this.

Thanks

- Xiubo

> Thanks,
>
>                  Ilya
>

Patch

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 78b622178a3d..db3d93d3e692 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2507,7 +2507,7 @@  static void submit_request(struct ceph_osd_request *req, bool wrlocked)
 	__submit_request(req, wrlocked);
 }
 
-static void finish_request(struct ceph_osd_request *req)
+static void __finish_request(struct ceph_osd_request *req)
 {
 	struct ceph_osd_client *osdc = req->r_osdc;
 
@@ -2516,12 +2516,6 @@  static void finish_request(struct ceph_osd_request *req)
 
 	req->r_end_latency = ktime_get();
 
-	if (req->r_osd) {
-		ceph_init_sparse_read(&req->r_osd->o_sparse_read);
-		unlink_request(req->r_osd, req);
-	}
-	atomic_dec(&osdc->num_requests);
-
 	/*
 	 * If an OSD has failed or returned and a request has been sent
 	 * twice, it's possible to get a reply and end up here while the
@@ -2532,13 +2526,46 @@  static void finish_request(struct ceph_osd_request *req)
 	ceph_msg_revoke_incoming(req->r_reply);
 }
 
+static void __remove_request(struct ceph_osd_request *req)
+{
+	struct ceph_osd_client *osdc = req->r_osdc;
+
+	dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
+
+	if (req->r_osd) {
+		ceph_init_sparse_read(&req->r_osd->o_sparse_read);
+		unlink_request(req->r_osd, req);
+	}
+	atomic_dec(&osdc->num_requests);
+}
+
+static void finish_request(struct ceph_osd_request *req)
+{
+	__finish_request(req);
+	__remove_request(req);
+}
+
 static void __complete_request(struct ceph_osd_request *req)
 {
+	struct ceph_osd_client *osdc = req->r_osdc;
+	struct ceph_osd *osd = req->r_osd;
+
 	dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
 	     req->r_tid, req->r_callback, req->r_result);
 
 	if (req->r_callback)
 		req->r_callback(req);
+
+	down_read(&osdc->lock);
+	if (osd) {
+		mutex_lock(&osd->lock);
+		__remove_request(req);
+		mutex_unlock(&osd->lock);
+	} else {
+		atomic_dec(&osdc->num_requests);
+	}
+	up_read(&osdc->lock);
+
 	complete_all(&req->r_completion);
 	ceph_osdc_put_request(req);
 }
@@ -3873,7 +3900,7 @@  static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 	WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
 	req->r_version = m.user_version;
 	req->r_result = m.result ?: data_len;
-	finish_request(req);
+	__finish_request(req);
 	mutex_unlock(&osd->lock);
 	up_read(&osdc->lock);
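
For reference, the reply-path ordering this patch intends, as a sketch
reconstructed from the hunks above (not authoritative):

  /*
   *  handle_reply()
   *    __finish_request(req)     // latency accounting, msg revoke;
   *                              // req stays linked in the osdc
   *  ...
   *  __complete_request(req)
   *    req->r_callback(req)      // callback runs before removal
   *    __remove_request(req)     // unlink + num_requests-- under
   *                              // osdc->lock / osd->lock
   *    complete_all(&req->r_completion)
   */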