diff mbox series

libceph: fix race between delayed_work() and ceph_monc_stop()

Message ID 20240709113848.336103-1-idryomov@gmail.com (mailing list archive)
State New, archived
Headers show
Series libceph: fix race between delayed_work() and ceph_monc_stop() | expand

Commit Message

Ilya Dryomov July 9, 2024, 11:38 a.m. UTC
The way the delayed work is handled in ceph_monc_stop() is prone to
races with mon_fault() and possibly also finish_hunting().  Both of
these can requeue the delayed work which wouldn't be canceled by any of
the following code in case that happens after cancel_delayed_work_sync()
runs -- __close_session() doesn't mess with the delayed work in order
to avoid interfering with the hunting interval logic.  This part was
missed in commit b5d91704f53e ("libceph: behave in mon_fault() if
cur_mon < 0") and use-after-free can still ensue on monc and objects
that hang off of it, with monc->auth and monc->monmap being
particularly susceptible to quickly being reused.

To fix this:

- clear monc->cur_mon and monc->hunting as part of closing the session
  in ceph_monc_stop()
- bail from delayed_work() if monc->cur_mon is cleared, similar to how
  it's done in mon_fault() and finish_hunting() (based on monc->hunting)
- call cancel_delayed_work_sync() after the session is closed

Cc: stable@vger.kernel.org
Link: https://tracker.ceph.com/issues/66857
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/mon_client.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

Comments

Xiubo Li July 10, 2024, 8:07 a.m. UTC | #1
On 7/9/24 19:38, Ilya Dryomov wrote:
> The way the delayed work is handled in ceph_monc_stop() is prone to
> races with mon_fault() and possibly also finish_hunting().  Both of
> these can requeue the delayed work which wouldn't be canceled by any of
> the following code in case that happens after cancel_delayed_work_sync()
> runs -- __close_session() doesn't mess with the delayed work in order
> to avoid interfering with the hunting interval logic.  This part was
> missed in commit b5d91704f53e ("libceph: behave in mon_fault() if
> cur_mon < 0") and use-after-free can still ensue on monc and objects
> that hang off of it, with monc->auth and monc->monmap being
> particularly susceptible to quickly being reused.
>
> To fix this:
>
> - clear monc->cur_mon and monc->hunting as part of closing the session
>    in ceph_monc_stop()
> - bail from delayed_work() if monc->cur_mon is cleared, similar to how
>    it's done in mon_fault() and finish_hunting() (based on monc->hunting)
> - call cancel_delayed_work_sync() after the session is closed
>
> Cc: stable@vger.kernel.org
> Link: https://tracker.ceph.com/issues/66857
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
>   net/ceph/mon_client.c | 14 ++++++++++++--
>   1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
> index f263f7e91a21..ab66b599ac47 100644
> --- a/net/ceph/mon_client.c
> +++ b/net/ceph/mon_client.c
> @@ -1085,13 +1085,19 @@ static void delayed_work(struct work_struct *work)
>   	struct ceph_mon_client *monc =
>   		container_of(work, struct ceph_mon_client, delayed_work.work);
>   
> -	dout("monc delayed_work\n");
>   	mutex_lock(&monc->mutex);
> +	dout("%s mon%d\n", __func__, monc->cur_mon);
> +	if (monc->cur_mon < 0) {
> +		goto out;
> +	}
> +
>   	if (monc->hunting) {
>   		dout("%s continuing hunt\n", __func__);
>   		reopen_session(monc);
>   	} else {
>   		int is_auth = ceph_auth_is_authenticated(monc->auth);
> +
> +		dout("%s is_authed %d\n", __func__, is_auth);
>   		if (ceph_con_keepalive_expired(&monc->con,
>   					       CEPH_MONC_PING_TIMEOUT)) {
>   			dout("monc keepalive timeout\n");
> @@ -1116,6 +1122,8 @@ static void delayed_work(struct work_struct *work)
>   		}
>   	}
>   	__schedule_delayed(monc);
> +
> +out:
>   	mutex_unlock(&monc->mutex);
>   }
>   
> @@ -1232,13 +1240,15 @@ EXPORT_SYMBOL(ceph_monc_init);
>   void ceph_monc_stop(struct ceph_mon_client *monc)
>   {
>   	dout("stop\n");
> -	cancel_delayed_work_sync(&monc->delayed_work);
>   
>   	mutex_lock(&monc->mutex);
>   	__close_session(monc);
> +	monc->hunting = false;
>   	monc->cur_mon = -1;
>   	mutex_unlock(&monc->mutex);
>   
> +	cancel_delayed_work_sync(&monc->delayed_work);
> +
>   	/*
>   	 * flush msgr queue before we destroy ourselves to ensure that:
>   	 *  - any work that references our embedded con is finished.

LGTM.

Reviewed-by: Xiubo Li <xiubli@redhat.com>
diff mbox series

Patch

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index f263f7e91a21..ab66b599ac47 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1085,13 +1085,19 @@  static void delayed_work(struct work_struct *work)
 	struct ceph_mon_client *monc =
 		container_of(work, struct ceph_mon_client, delayed_work.work);
 
-	dout("monc delayed_work\n");
 	mutex_lock(&monc->mutex);
+	dout("%s mon%d\n", __func__, monc->cur_mon);
+	if (monc->cur_mon < 0) {
+		goto out;
+	}
+
 	if (monc->hunting) {
 		dout("%s continuing hunt\n", __func__);
 		reopen_session(monc);
 	} else {
 		int is_auth = ceph_auth_is_authenticated(monc->auth);
+
+		dout("%s is_authed %d\n", __func__, is_auth);
 		if (ceph_con_keepalive_expired(&monc->con,
 					       CEPH_MONC_PING_TIMEOUT)) {
 			dout("monc keepalive timeout\n");
@@ -1116,6 +1122,8 @@  static void delayed_work(struct work_struct *work)
 		}
 	}
 	__schedule_delayed(monc);
+
+out:
 	mutex_unlock(&monc->mutex);
 }
 
@@ -1232,13 +1240,15 @@  EXPORT_SYMBOL(ceph_monc_init);
 void ceph_monc_stop(struct ceph_mon_client *monc)
 {
 	dout("stop\n");
-	cancel_delayed_work_sync(&monc->delayed_work);
 
 	mutex_lock(&monc->mutex);
 	__close_session(monc);
+	monc->hunting = false;
 	monc->cur_mon = -1;
 	mutex_unlock(&monc->mutex);
 
+	cancel_delayed_work_sync(&monc->delayed_work);
+
 	/*
 	 * flush msgr queue before we destroy ourselves to ensure that:
 	 *  - any work that references our embedded con is finished.