Message ID | 1390302667-8729-4-git-send-email-ilya.dryomov@inktank.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, 21 Jan 2014, Ilya Dryomov wrote: > Keepalive mechanism that we are currently using doesn't handle dead > (e.g. half-open in RFC 793 sense) TCP connections: a) it's based on > pending ceph_osd_requests which are not necessarily present, and b) > keepalive byte is only sent if connection is in CON_STATE_OPEN state > because of protocol restrictions. This does not cover connection > handshake and negotiation stages and can lead to kernel client hanging > forever. Fix it by forcibly resetting the connection if after two > mount_timeouts we still haven't transitioned to CON_STATE_OPEN. > > Fixes: http://tracker.ceph.com/issues/7139 > > Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> > --- > include/linux/ceph/messenger.h | 5 +++++ > net/ceph/ceph_common.c | 1 + > net/ceph/messenger.c | 42 ++++++++++++++++++++++++++++++++++++++-- > 3 files changed, 46 insertions(+), 2 deletions(-) > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > index 20ee8b63a968..25d3ec8bcdde 100644 > --- a/include/linux/ceph/messenger.h > +++ b/include/linux/ceph/messenger.h > @@ -62,6 +62,8 @@ struct ceph_messenger { > > u64 supported_features; > u64 required_features; > + > + unsigned long connect_timeout; /* jiffies */ > }; > > enum ceph_msg_data_type { > @@ -240,6 +242,8 @@ struct ceph_connection { > > struct delayed_work work; /* send|recv work */ > unsigned long delay; /* current delay interval */ > + > + struct delayed_work connect_timeout_work; > }; > > > @@ -257,6 +261,7 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, > struct ceph_entity_addr *myaddr, > u64 supported_features, > u64 required_features, > + unsigned long connect_timeout, > bool nocrc); > > extern void ceph_con_init(struct ceph_connection *con, void *private, > diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c > index 67d7721d237e..bc41e5a392a4 100644 > --- a/net/ceph/ceph_common.c > +++ b/net/ceph/ceph_common.c > @@ -511,6 +511,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, > ceph_messenger_init(&client->msgr, myaddr, > client->supported_features, > client->required_features, > + 2 * client->options->mount_timeout * HZ, Any specific reason for 2x? I would like 1/2 if I had to pull a number out of a hat. OTOH, mount_timeout could be 0, too, so it might make more sense to make this a separate mount option entirely. Either that, or just make this a separate hard-coded value that is not related to mount_timeout. > ceph_test_opt(client, NOCRC)); > > /* subsystems */ > diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c > index c7e0143d24f1..65dea6c8c6f8 100644 > --- a/net/ceph/messenger.c > +++ b/net/ceph/messenger.c > @@ -385,6 +385,16 @@ static void con_fault_raise(struct ceph_connection *con) > queue_con(con); > } > > +static void handle_connect_timeout(struct work_struct *work) > +{ > + struct ceph_connection *con = container_of(work, > + struct ceph_connection, > + connect_timeout_work.work); > + > + dout("%s connect timeout\n", __func__); > + con_fault_raise(con); > +} > + > > /* > * socket callback functions > @@ -447,6 +457,18 @@ static void ceph_sock_state_change(struct sock *sk) > case TCP_ESTABLISHED: > dout("%s TCP_ESTABLISHED\n", __func__); > con_sock_state_connected(con); > + /* > + * If not told otherwise, arm connect_timeout hammer to > + * be able to deal with half-open (in RFC 793 sense) > + * sockets in the interim before we transition to > + * CON_STATE_OPEN and regular request keepalive > + * mechanism (ceph_con_keepalive()) kicks in. > + */ > + if (con->msgr->connect_timeout) { > + dout("arming connect_timeout hammer\n"); > + schedule_delayed_work(&con->connect_timeout_work, > + con->msgr->connect_timeout); > + } > queue_con(con); > break; > default: /* Everything else is uninteresting */ > @@ -586,6 +608,13 @@ static int con_close_socket(struct ceph_connection *con) > int rc = 0; > > dout("con_close_socket on %p sock %p\n", con, con->sock); > + > + /* > + * Sync with con_fault_raise() in handle_connect_timeout() to > + * make sure that SOCK_CLOSED is going to be cleared for good. > + */ > + cancel_delayed_work_sync(&con->connect_timeout_work); At first I thought there would be locking problems here, but AFAICS there is no real difference between this timer firing and a normal state change callback from the socket. Aside from the timeout nit, this looks right! sage > + > if (con->sock) { > rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); > sock_release(con->sock); > @@ -595,8 +624,8 @@ static int con_close_socket(struct ceph_connection *con) > /* > * Forcibly clear the SOCK_CLOSED flag. It gets set > * independent of the connection mutex, and we could have > - * received a socket close event before we had the chance to > - * shut the socket down. > + * received a socket close and/or connect_timeout event > + * before we had the chance to shut the socket down. > */ > con_flag_clear(con, CON_FLAG_SOCK_CLOSED); > > @@ -725,6 +754,7 @@ void ceph_con_init(struct ceph_connection *con, void *private, > INIT_LIST_HEAD(&con->out_queue); > INIT_LIST_HEAD(&con->out_sent); > INIT_DELAYED_WORK(&con->work, con_work); > + INIT_DELAYED_WORK(&con->connect_timeout_work, handle_connect_timeout); > > con->state = CON_STATE_CLOSED; > } > @@ -2076,6 +2106,7 @@ static int process_connect(struct ceph_connection *con) > > WARN_ON(con->state != CON_STATE_NEGOTIATING); > con->state = CON_STATE_OPEN; > + > con->auth_retry = 0; /* we authenticated; clear flag */ > con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); > con->connect_seq++; > @@ -2092,6 +2123,11 @@ static int process_connect(struct ceph_connection *con) > > con->delay = 0; /* reset backoff memory */ > > + if (con->msgr->connect_timeout) { > + dout("disarming connect_timeout hammer\n"); > + cancel_delayed_work(&con->connect_timeout_work); > + } > + > if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { > prepare_write_seq(con); > prepare_read_seq(con); > @@ -2866,10 +2902,12 @@ void ceph_messenger_init(struct ceph_messenger *msgr, > struct ceph_entity_addr *myaddr, > u64 supported_features, > u64 required_features, > + unsigned long connect_timeout, > bool nocrc) > { > msgr->supported_features = supported_features; > msgr->required_features = required_features; > + msgr->connect_timeout = connect_timeout; > > spin_lock_init(&msgr->global_seq_lock); > > -- > 1.7.10.4 > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Jan 25, 2014 at 1:07 AM, Sage Weil <sage@inktank.com> wrote: > On Tue, 21 Jan 2014, Ilya Dryomov wrote: >> Keepalive mechanism that we are currently using doesn't handle dead >> (e.g. half-open in RFC 793 sense) TCP connections: a) it's based on >> pending ceph_osd_requests which are not necessarily present, and b) >> keepalive byte is only sent if connection is in CON_STATE_OPEN state >> because of protocol restrictions. This does not cover connection >> handshake and negotiation stages and can lead to kernel client hanging >> forever. Fix it by forcibly resetting the connection if after two >> mount_timeouts we still haven't transitioned to CON_STATE_OPEN. >> >> Fixes: http://tracker.ceph.com/issues/7139 >> >> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> >> --- >> include/linux/ceph/messenger.h | 5 +++++ >> net/ceph/ceph_common.c | 1 + >> net/ceph/messenger.c | 42 ++++++++++++++++++++++++++++++++++++++-- >> 3 files changed, 46 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h >> index 20ee8b63a968..25d3ec8bcdde 100644 >> --- a/include/linux/ceph/messenger.h >> +++ b/include/linux/ceph/messenger.h >> @@ -62,6 +62,8 @@ struct ceph_messenger { >> >> u64 supported_features; >> u64 required_features; >> + >> + unsigned long connect_timeout; /* jiffies */ >> }; >> >> enum ceph_msg_data_type { >> @@ -240,6 +242,8 @@ struct ceph_connection { >> >> struct delayed_work work; /* send|recv work */ >> unsigned long delay; /* current delay interval */ >> + >> + struct delayed_work connect_timeout_work; >> }; >> >> >> @@ -257,6 +261,7 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, >> struct ceph_entity_addr *myaddr, >> u64 supported_features, >> u64 required_features, >> + unsigned long connect_timeout, >> bool nocrc); >> >> extern void ceph_con_init(struct ceph_connection *con, void *private, >> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c >> index 67d7721d237e..bc41e5a392a4 100644 >> --- a/net/ceph/ceph_common.c >> +++ b/net/ceph/ceph_common.c >> @@ -511,6 +511,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, >> ceph_messenger_init(&client->msgr, myaddr, >> client->supported_features, >> client->required_features, >> + 2 * client->options->mount_timeout * HZ, > > Any specific reason for 2x? I would like 1/2 if I had to pull a number > out of a hat. OTOH, mount_timeout could be 0, too, so it might make more > sense to make this a separate mount option entirely. Either that, or just > make this a separate hard-coded value that is not related to > mount_timeout. Yes, my logic for 2x was basically to not impose anything on the rest the system. Apart from some cephfs-specific stuff, mount_timeout is used in __ceph_open_session, where 1/2 would theoretically interfere with the wait-for-maps loop by virtue of shutting down the socket too early. On mount_timeout=0 connect_timeout hammer isn't armed. This was useful in testing, and it doesn't seem to change existing semantics at all. On the contrary, as it is, mount_timeout == 0 is a possible infinite loop anyway, so not arming connect_timeout (i.e. not defending against another endless hang) maps onto it quite nicely ;) Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
I've looked into this some more and I now think connect_timeout approach not the best way to fix this. I think a better way is to go through all connection and negotiation waits on both rbd and cephfs sides and make sure they are interruptible and have a timeout attached. Only if that turns out to be not feasible should we go with this connect_timeout bolt-on. Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 26 Jan 2014, Ilya Dryomov wrote: > On Sat, Jan 25, 2014 at 1:07 AM, Sage Weil <sage@inktank.com> wrote: > > On Tue, 21 Jan 2014, Ilya Dryomov wrote: > >> Keepalive mechanism that we are currently using doesn't handle dead > >> (e.g. half-open in RFC 793 sense) TCP connections: a) it's based on > >> pending ceph_osd_requests which are not necessarily present, and b) > >> keepalive byte is only sent if connection is in CON_STATE_OPEN state > >> because of protocol restrictions. This does not cover connection > >> handshake and negotiation stages and can lead to kernel client hanging > >> forever. Fix it by forcibly resetting the connection if after two > >> mount_timeouts we still haven't transitioned to CON_STATE_OPEN. > >> > >> Fixes: http://tracker.ceph.com/issues/7139 > >> > >> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> > >> --- > >> include/linux/ceph/messenger.h | 5 +++++ > >> net/ceph/ceph_common.c | 1 + > >> net/ceph/messenger.c | 42 ++++++++++++++++++++++++++++++++++++++-- > >> 3 files changed, 46 insertions(+), 2 deletions(-) > >> > >> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > >> index 20ee8b63a968..25d3ec8bcdde 100644 > >> --- a/include/linux/ceph/messenger.h > >> +++ b/include/linux/ceph/messenger.h > >> @@ -62,6 +62,8 @@ struct ceph_messenger { > >> > >> u64 supported_features; > >> u64 required_features; > >> + > >> + unsigned long connect_timeout; /* jiffies */ > >> }; > >> > >> enum ceph_msg_data_type { > >> @@ -240,6 +242,8 @@ struct ceph_connection { > >> > >> struct delayed_work work; /* send|recv work */ > >> unsigned long delay; /* current delay interval */ > >> + > >> + struct delayed_work connect_timeout_work; > >> }; > >> > >> > >> @@ -257,6 +261,7 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, > >> struct ceph_entity_addr *myaddr, > >> u64 supported_features, > >> u64 required_features, > >> + unsigned long connect_timeout, > >> bool nocrc); > >> > >> extern void ceph_con_init(struct ceph_connection *con, void *private, > >> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c > >> index 67d7721d237e..bc41e5a392a4 100644 > >> --- a/net/ceph/ceph_common.c > >> +++ b/net/ceph/ceph_common.c > >> @@ -511,6 +511,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, > >> ceph_messenger_init(&client->msgr, myaddr, > >> client->supported_features, > >> client->required_features, > >> + 2 * client->options->mount_timeout * HZ, > > > > Any specific reason for 2x? I would like 1/2 if I had to pull a number > > out of a hat. OTOH, mount_timeout could be 0, too, so it might make more > > sense to make this a separate mount option entirely. Either that, or just > > make this a separate hard-coded value that is not related to > > mount_timeout. > > Yes, my logic for 2x was basically to not impose anything on the rest > the system. Apart from some cephfs-specific stuff, mount_timeout is > used in __ceph_open_session, where 1/2 would theoretically interfere > with the wait-for-maps loop by virtue of shutting down the socket too > early. If this is essentially manifesting as a socket reset/disconnect, then having it at 1/2 should trigger a ceph_fault and then reconnect within the messenger layer and give us a second chance before the mount_timeout expires. At least, that's the way I'm reading the con_work: if ((fault = con_sock_closed(con))) { dout("%s: con %p SOCK_CLOSED\n", __func__, con); break; } ... if (fault) con_fault(con); I still lean toward a separately defined timeout here... > On mount_timeout=0 connect_timeout hammer isn't armed. This was useful > in testing, and it doesn't seem to change existing semantics at all. > On the contrary, as it is, mount_timeout == 0 is a possible infinite > loop anyway, so not arming connect_timeout (i.e. not defending against > another endless hang) maps onto it quite nicely ;) Sounds good. sage -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 20ee8b63a968..25d3ec8bcdde 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -62,6 +62,8 @@ struct ceph_messenger { u64 supported_features; u64 required_features; + + unsigned long connect_timeout; /* jiffies */ }; enum ceph_msg_data_type { @@ -240,6 +242,8 @@ struct ceph_connection { struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ + + struct delayed_work connect_timeout_work; }; @@ -257,6 +261,7 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, + unsigned long connect_timeout, bool nocrc); extern void ceph_con_init(struct ceph_connection *con, void *private, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 67d7721d237e..bc41e5a392a4 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -511,6 +511,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, ceph_messenger_init(&client->msgr, myaddr, client->supported_features, client->required_features, + 2 * client->options->mount_timeout * HZ, ceph_test_opt(client, NOCRC)); /* subsystems */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c7e0143d24f1..65dea6c8c6f8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -385,6 +385,16 @@ static void con_fault_raise(struct ceph_connection *con) queue_con(con); } +static void handle_connect_timeout(struct work_struct *work) +{ + struct ceph_connection *con = container_of(work, + struct ceph_connection, + connect_timeout_work.work); + + dout("%s connect timeout\n", __func__); + con_fault_raise(con); +} + /* * socket callback functions @@ -447,6 +457,18 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_ESTABLISHED: dout("%s TCP_ESTABLISHED\n", __func__); con_sock_state_connected(con); + /* + * If not told otherwise, arm connect_timeout hammer to + * be able to deal with half-open (in RFC 793 sense) + * sockets in the interim before we transition to + * CON_STATE_OPEN and regular request keepalive + * mechanism (ceph_con_keepalive()) kicks in. + */ + if (con->msgr->connect_timeout) { + dout("arming connect_timeout hammer\n"); + schedule_delayed_work(&con->connect_timeout_work, + con->msgr->connect_timeout); + } queue_con(con); break; default: /* Everything else is uninteresting */ @@ -586,6 +608,13 @@ static int con_close_socket(struct ceph_connection *con) int rc = 0; dout("con_close_socket on %p sock %p\n", con, con->sock); + + /* + * Sync with con_fault_raise() in handle_connect_timeout() to + * make sure that SOCK_CLOSED is going to be cleared for good. + */ + cancel_delayed_work_sync(&con->connect_timeout_work); + if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); @@ -595,8 +624,8 @@ static int con_close_socket(struct ceph_connection *con) /* * Forcibly clear the SOCK_CLOSED flag. It gets set * independent of the connection mutex, and we could have - * received a socket close event before we had the chance to - * shut the socket down. + * received a socket close and/or connect_timeout event + * before we had the chance to shut the socket down. */ con_flag_clear(con, CON_FLAG_SOCK_CLOSED); @@ -725,6 +754,7 @@ void ceph_con_init(struct ceph_connection *con, void *private, INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, con_work); + INIT_DELAYED_WORK(&con->connect_timeout_work, handle_connect_timeout); con->state = CON_STATE_CLOSED; } @@ -2076,6 +2106,7 @@ static int process_connect(struct ceph_connection *con) WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_OPEN; + con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; @@ -2092,6 +2123,11 @@ static int process_connect(struct ceph_connection *con) con->delay = 0; /* reset backoff memory */ + if (con->msgr->connect_timeout) { + dout("disarming connect_timeout hammer\n"); + cancel_delayed_work(&con->connect_timeout_work); + } + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { prepare_write_seq(con); prepare_read_seq(con); @@ -2866,10 +2902,12 @@ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, + unsigned long connect_timeout, bool nocrc) { msgr->supported_features = supported_features; msgr->required_features = required_features; + msgr->connect_timeout = connect_timeout; spin_lock_init(&msgr->global_seq_lock);
Keepalive mechanism that we are currently using doesn't handle dead (e.g. half-open in RFC 793 sense) TCP connections: a) it's based on pending ceph_osd_requests which are not necessarily present, and b) keepalive byte is only sent if connection is in CON_STATE_OPEN state because of protocol restrictions. This does not cover connection handshake and negotiation stages and can lead to kernel client hanging forever. Fix it by forcibly resetting the connection if after two mount_timeouts we still haven't transitioned to CON_STATE_OPEN. Fixes: http://tracker.ceph.com/issues/7139 Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> --- include/linux/ceph/messenger.h | 5 +++++ net/ceph/ceph_common.c | 1 + net/ceph/messenger.c | 42 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 46 insertions(+), 2 deletions(-)