Message ID | 20240118105047.792879-4-xiubli@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | libceph: fix sparse-read failure bug | expand |
On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > The messages from ceph maybe split into multiple socket packages > and we just need to wait for all the data to be availiable on the > sokcet. > > This will add 'sr_total_resid' to record the total length for all > data items for sparse-read message and 'sr_resid_elen' to record > the current extent total length. > It's been a while since I was in the ceph messenger code, and my v1 memory is especially fuzzy. I really don't quite understand how tracking yet another length field solves the stated problem. I'd really appreciate a description of the problem that you saw and how this solves it. The ceph bug is not very straightforward. I get that we need to wait for a certain amount of data to be available on the socket before we drive the sparse_read op, but I don't quite see how that's being achieved here. Also, why is this not a problem on messenger v2? > URL: https://tracker.ceph.com/issues/63586 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > include/linux/ceph/messenger.h | 1 + > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > 2 files changed, 22 insertions(+), 11 deletions(-) > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > index 2eaaabbe98cb..ca6f82abed62 100644 > --- a/include/linux/ceph/messenger.h > +++ b/include/linux/ceph/messenger.h > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > struct ceph_msg_data_cursor { > size_t total_resid; /* across all data items */ > + size_t sr_total_resid; /* across all data items for sparse-read */ > > struct ceph_msg_data *data; /* current data item */ > size_t resid; /* bytes not yet consumed */ > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > index 4cb60bacf5f5..2733da891688 100644 > --- a/net/ceph/messenger_v1.c > +++ b/net/ceph/messenger_v1.c > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > { > /* Initialize data cursor if it's not a sparse read */ > - if (!msg->sparse_read) > + if (msg->sparse_read) > + msg->cursor.sr_total_resid = data_len; > + else > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > } > > @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) > bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); > u32 crc = 0; > int ret = 1; > + int len; > > if (do_datacrc) > crc = con->in_data_crc; > > - do { > - if (con->v1.in_sr_kvec.iov_base) > + while (cursor->sr_total_resid) { > + len = 0; > + if (con->v1.in_sr_kvec.iov_base) { > + len = con->v1.in_sr_kvec.iov_len; > ret = read_partial_message_chunk(con, > &con->v1.in_sr_kvec, > con->v1.in_sr_len, > &crc); > - else if (cursor->sr_resid > 0) > + len = con->v1.in_sr_kvec.iov_len - len; > + } else if (cursor->sr_resid > 0) { > + len = cursor->sr_resid; > ret = read_partial_sparse_msg_extent(con, &crc); > - > - if (ret <= 0) { > - if (do_datacrc) > - con->in_data_crc = crc; > - return ret; > + len -= cursor->sr_resid; > } > + cursor->sr_total_resid -= len; > + if (ret <= 0) > + break; > > memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); > ret = con->ops->sparse_read(con, cursor, > (char **)&con->v1.in_sr_kvec.iov_base); > + if (ret <= 0) { > + ret = ret ? : 1; /* must return > 0 to indicate success */ The above is a gcc-ism that we probably shouldn't be using. Can this be: ret = ret ? ret : 1; ? > + break; > + } > con->v1.in_sr_len = ret; > - } while (ret > 0); > + } > > if (do_datacrc) > con->in_data_crc = crc; > > - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ > + return ret; > } > > static int read_partial_msg_data(struct ceph_connection *con)
On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > The messages from ceph maybe split into multiple socket packages > and we just need to wait for all the data to be availiable on the > sokcet. > > This will add 'sr_total_resid' to record the total length for all > data items for sparse-read message and 'sr_resid_elen' to record > the current extent total length. > > URL: https://tracker.ceph.com/issues/63586 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > include/linux/ceph/messenger.h | 1 + > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > 2 files changed, 22 insertions(+), 11 deletions(-) > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > index 2eaaabbe98cb..ca6f82abed62 100644 > --- a/include/linux/ceph/messenger.h > +++ b/include/linux/ceph/messenger.h > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > struct ceph_msg_data_cursor { > size_t total_resid; /* across all data items */ > + size_t sr_total_resid; /* across all data items for sparse-read */ > > struct ceph_msg_data *data; /* current data item */ > size_t resid; /* bytes not yet consumed */ > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > index 4cb60bacf5f5..2733da891688 100644 > --- a/net/ceph/messenger_v1.c > +++ b/net/ceph/messenger_v1.c > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > { > /* Initialize data cursor if it's not a sparse read */ > - if (!msg->sparse_read) > + if (msg->sparse_read) > + msg->cursor.sr_total_resid = data_len; > + else > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > } > > @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) > bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); > u32 crc = 0; > int ret = 1; > + int len; > > if (do_datacrc) > crc = con->in_data_crc; > > - do { > - if (con->v1.in_sr_kvec.iov_base) > + while (cursor->sr_total_resid) { > + len = 0; > + if (con->v1.in_sr_kvec.iov_base) { > + len = con->v1.in_sr_kvec.iov_len; > ret = read_partial_message_chunk(con, > &con->v1.in_sr_kvec, > con->v1.in_sr_len, > &crc); > - else if (cursor->sr_resid > 0) > + len = con->v1.in_sr_kvec.iov_len - len; > + } else if (cursor->sr_resid > 0) { > + len = cursor->sr_resid; > ret = read_partial_sparse_msg_extent(con, &crc); > - > - if (ret <= 0) { > - if (do_datacrc) > - con->in_data_crc = crc; > - return ret; > + len -= cursor->sr_resid; > } > + cursor->sr_total_resid -= len; > + if (ret <= 0) > + break; > > memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); > ret = con->ops->sparse_read(con, cursor, > (char **)&con->v1.in_sr_kvec.iov_base); > + if (ret <= 0) { > + ret = ret ? : 1; /* must return > 0 to indicate success */ > + break; > + } > con->v1.in_sr_len = ret; > - } while (ret > 0); > + } > > if (do_datacrc) > con->in_data_crc = crc; > > - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ > + return ret; > } > > static int read_partial_msg_data(struct ceph_connection *con) Looking back over this code... The way it works today, once we determine it's a sparse read, we call read_sparse_msg_data. At that point we call either read_partial_message_chunk (to read into the kvec) or read_sparse_msg_extent if sr_resid is already set (indicating that we're receiving an extent). read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until cursor->sr_resid have been received. The exception there when ceph_tcp_recvpage returns <= 0. ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also in other cases). So it sounds like the client just timed out on a read from the socket or caught a signal or something? If that's correct, then do we know what ceph_tcp_recvpage returned when the problem happened?
On 1/19/24 02:24, Jeff Layton wrote: > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> The messages from ceph maybe split into multiple socket packages >> and we just need to wait for all the data to be availiable on the >> sokcet. >> >> This will add 'sr_total_resid' to record the total length for all >> data items for sparse-read message and 'sr_resid_elen' to record >> the current extent total length. >> >> URL: https://tracker.ceph.com/issues/63586 >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> include/linux/ceph/messenger.h | 1 + >> net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- >> 2 files changed, 22 insertions(+), 11 deletions(-) >> >> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h >> index 2eaaabbe98cb..ca6f82abed62 100644 >> --- a/include/linux/ceph/messenger.h >> +++ b/include/linux/ceph/messenger.h >> @@ -231,6 +231,7 @@ struct ceph_msg_data { >> >> struct ceph_msg_data_cursor { >> size_t total_resid; /* across all data items */ >> + size_t sr_total_resid; /* across all data items for sparse-read */ >> >> struct ceph_msg_data *data; /* current data item */ >> size_t resid; /* bytes not yet consumed */ >> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c >> index 4cb60bacf5f5..2733da891688 100644 >> --- a/net/ceph/messenger_v1.c >> +++ b/net/ceph/messenger_v1.c >> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) >> static void prepare_message_data(struct ceph_msg *msg, u32 data_len) >> { >> /* Initialize data cursor if it's not a sparse read */ >> - if (!msg->sparse_read) >> + if (msg->sparse_read) >> + msg->cursor.sr_total_resid = data_len; >> + else >> ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); >> } >> >> @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) >> bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); >> u32 crc = 0; >> int ret = 1; >> + int len; >> >> if (do_datacrc) >> crc = con->in_data_crc; >> >> - do { >> - if (con->v1.in_sr_kvec.iov_base) >> + while (cursor->sr_total_resid) { >> + len = 0; >> + if (con->v1.in_sr_kvec.iov_base) { >> + len = con->v1.in_sr_kvec.iov_len; >> ret = read_partial_message_chunk(con, >> &con->v1.in_sr_kvec, >> con->v1.in_sr_len, >> &crc); >> - else if (cursor->sr_resid > 0) >> + len = con->v1.in_sr_kvec.iov_len - len; >> + } else if (cursor->sr_resid > 0) { >> + len = cursor->sr_resid; >> ret = read_partial_sparse_msg_extent(con, &crc); >> - >> - if (ret <= 0) { >> - if (do_datacrc) >> - con->in_data_crc = crc; >> - return ret; >> + len -= cursor->sr_resid; >> } >> + cursor->sr_total_resid -= len; >> + if (ret <= 0) >> + break; >> >> memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); >> ret = con->ops->sparse_read(con, cursor, >> (char **)&con->v1.in_sr_kvec.iov_base); >> + if (ret <= 0) { >> + ret = ret ? : 1; /* must return > 0 to indicate success */ >> + break; >> + } >> con->v1.in_sr_len = ret; >> - } while (ret > 0); >> + } >> >> if (do_datacrc) >> con->in_data_crc = crc; >> >> - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ >> + return ret; >> } >> >> static int read_partial_msg_data(struct ceph_connection *con) > Looking back over this code... > > The way it works today, once we determine it's a sparse read, we call > read_sparse_msg_data. At that point we call either > read_partial_message_chunk (to read into the kvec) or > read_sparse_msg_extent if sr_resid is already set (indicating that we're > receiving an extent). > > read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until > cursor->sr_resid have been received. The exception there when > ceph_tcp_recvpage returns <= 0. > > ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also > in other cases). So it sounds like the client just timed out on a read > from the socket or caught a signal or something? > > If that's correct, then do we know what ceph_tcp_recvpage returned when > the problem happened? It should just return parital data, and we should continue from here in the next loop when the reset data comes. Thanks - Xiubo
On Fri, 2024-01-19 at 12:35 +0800, Xiubo Li wrote: > On 1/19/24 02:24, Jeff Layton wrote: > > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > The messages from ceph maybe split into multiple socket packages > > > and we just need to wait for all the data to be availiable on the > > > sokcet. > > > > > > This will add 'sr_total_resid' to record the total length for all > > > data items for sparse-read message and 'sr_resid_elen' to record > > > the current extent total length. > > > > > > URL: https://tracker.ceph.com/issues/63586 > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > --- > > > include/linux/ceph/messenger.h | 1 + > > > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > > > 2 files changed, 22 insertions(+), 11 deletions(-) > > > > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > > > index 2eaaabbe98cb..ca6f82abed62 100644 > > > --- a/include/linux/ceph/messenger.h > > > +++ b/include/linux/ceph/messenger.h > > > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > > > > > struct ceph_msg_data_cursor { > > > size_t total_resid; /* across all data items */ > > > + size_t sr_total_resid; /* across all data items for sparse-read */ > > > > > > struct ceph_msg_data *data; /* current data item */ > > > size_t resid; /* bytes not yet consumed */ > > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > > > index 4cb60bacf5f5..2733da891688 100644 > > > --- a/net/ceph/messenger_v1.c > > > +++ b/net/ceph/messenger_v1.c > > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > > > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > > > { > > > /* Initialize data cursor if it's not a sparse read */ > > > - if (!msg->sparse_read) > > > + if (msg->sparse_read) > > > + msg->cursor.sr_total_resid = data_len; > > > + else > > > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > > > } > > > > > > @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) > > > bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); > > > u32 crc = 0; > > > int ret = 1; > > > + int len; > > > > > > if (do_datacrc) > > > crc = con->in_data_crc; > > > > > > - do { > > > - if (con->v1.in_sr_kvec.iov_base) > > > + while (cursor->sr_total_resid) { > > > + len = 0; > > > + if (con->v1.in_sr_kvec.iov_base) { > > > + len = con->v1.in_sr_kvec.iov_len; > > > ret = read_partial_message_chunk(con, > > > &con->v1.in_sr_kvec, > > > con->v1.in_sr_len, > > > &crc); > > > - else if (cursor->sr_resid > 0) > > > + len = con->v1.in_sr_kvec.iov_len - len; > > > + } else if (cursor->sr_resid > 0) { > > > + len = cursor->sr_resid; > > > ret = read_partial_sparse_msg_extent(con, &crc); > > > - > > > - if (ret <= 0) { > > > - if (do_datacrc) > > > - con->in_data_crc = crc; > > > - return ret; > > > + len -= cursor->sr_resid; > > > } > > > + cursor->sr_total_resid -= len; > > > + if (ret <= 0) > > > + break; > > > > > > memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); > > > ret = con->ops->sparse_read(con, cursor, > > > (char **)&con->v1.in_sr_kvec.iov_base); > > > + if (ret <= 0) { > > > + ret = ret ? : 1; /* must return > 0 to indicate success */ > > > + break; > > > + } > > > con->v1.in_sr_len = ret; > > > - } while (ret > 0); > > > + } > > > > > > if (do_datacrc) > > > con->in_data_crc = crc; > > > > > > - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ > > > + return ret; > > > } > > > > > > static int read_partial_msg_data(struct ceph_connection *con) > > Looking back over this code... > > > > The way it works today, once we determine it's a sparse read, we call > > read_sparse_msg_data. At that point we call either > > read_partial_message_chunk (to read into the kvec) or > > read_sparse_msg_extent if sr_resid is already set (indicating that we're > > receiving an extent). > > > > read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until > > cursor->sr_resid have been received. The exception there when > > ceph_tcp_recvpage returns <= 0. > > > > ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also > > in other cases). So it sounds like the client just timed out on a read > > from the socket or caught a signal or something? > > > > If that's correct, then do we know what ceph_tcp_recvpage returned when > > the problem happened? > > It should just return parital data, and we should continue from here in > the next loop when the reset data comes. > Tracking this extra length seems like the wrong fix. We're already looping in read_sparse_msg_extent until the sr_resid goes to 0. ISTM that it's just that read_sparse_msg_extent is returning inappropriately in the face of timeouts. IOW, it does this: ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len); if (ret <= 0) return ret; ...should it just not be returning there when ret == 0? Maybe it should be retrying the recvpage instead?
On 1/19/24 19:09, Jeff Layton wrote: > On Fri, 2024-01-19 at 12:35 +0800, Xiubo Li wrote: >> On 1/19/24 02:24, Jeff Layton wrote: >>> On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: >>>> From: Xiubo Li <xiubli@redhat.com> >>>> >>>> The messages from ceph maybe split into multiple socket packages >>>> and we just need to wait for all the data to be availiable on the >>>> sokcet. >>>> >>>> This will add 'sr_total_resid' to record the total length for all >>>> data items for sparse-read message and 'sr_resid_elen' to record >>>> the current extent total length. >>>> >>>> URL: https://tracker.ceph.com/issues/63586 >>>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>>> --- >>>> include/linux/ceph/messenger.h | 1 + >>>> net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- >>>> 2 files changed, 22 insertions(+), 11 deletions(-) >>>> >>>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h >>>> index 2eaaabbe98cb..ca6f82abed62 100644 >>>> --- a/include/linux/ceph/messenger.h >>>> +++ b/include/linux/ceph/messenger.h >>>> @@ -231,6 +231,7 @@ struct ceph_msg_data { >>>> >>>> struct ceph_msg_data_cursor { >>>> size_t total_resid; /* across all data items */ >>>> + size_t sr_total_resid; /* across all data items for sparse-read */ >>>> >>>> struct ceph_msg_data *data; /* current data item */ >>>> size_t resid; /* bytes not yet consumed */ >>>> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c >>>> index 4cb60bacf5f5..2733da891688 100644 >>>> --- a/net/ceph/messenger_v1.c >>>> +++ b/net/ceph/messenger_v1.c >>>> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) >>>> static void prepare_message_data(struct ceph_msg *msg, u32 data_len) >>>> { >>>> /* Initialize data cursor if it's not a sparse read */ >>>> - if (!msg->sparse_read) >>>> + if (msg->sparse_read) >>>> + msg->cursor.sr_total_resid = data_len; >>>> + else >>>> ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); >>>> } >>>> >>>> @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) >>>> bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); >>>> u32 crc = 0; >>>> int ret = 1; >>>> + int len; >>>> >>>> if (do_datacrc) >>>> crc = con->in_data_crc; >>>> >>>> - do { >>>> - if (con->v1.in_sr_kvec.iov_base) >>>> + while (cursor->sr_total_resid) { >>>> + len = 0; >>>> + if (con->v1.in_sr_kvec.iov_base) { >>>> + len = con->v1.in_sr_kvec.iov_len; >>>> ret = read_partial_message_chunk(con, >>>> &con->v1.in_sr_kvec, >>>> con->v1.in_sr_len, >>>> &crc); >>>> - else if (cursor->sr_resid > 0) >>>> + len = con->v1.in_sr_kvec.iov_len - len; >>>> + } else if (cursor->sr_resid > 0) { >>>> + len = cursor->sr_resid; >>>> ret = read_partial_sparse_msg_extent(con, &crc); >>>> - >>>> - if (ret <= 0) { >>>> - if (do_datacrc) >>>> - con->in_data_crc = crc; >>>> - return ret; >>>> + len -= cursor->sr_resid; >>>> } >>>> + cursor->sr_total_resid -= len; >>>> + if (ret <= 0) >>>> + break; >>>> >>>> memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); >>>> ret = con->ops->sparse_read(con, cursor, >>>> (char **)&con->v1.in_sr_kvec.iov_base); >>>> + if (ret <= 0) { >>>> + ret = ret ? : 1; /* must return > 0 to indicate success */ >>>> + break; >>>> + } >>>> con->v1.in_sr_len = ret; >>>> - } while (ret > 0); >>>> + } >>>> >>>> if (do_datacrc) >>>> con->in_data_crc = crc; >>>> >>>> - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ >>>> + return ret; >>>> } >>>> >>>> static int read_partial_msg_data(struct ceph_connection *con) >>> Looking back over this code... >>> >>> The way it works today, once we determine it's a sparse read, we call >>> read_sparse_msg_data. At that point we call either >>> read_partial_message_chunk (to read into the kvec) or >>> read_sparse_msg_extent if sr_resid is already set (indicating that we're >>> receiving an extent). >>> >>> read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until >>> cursor->sr_resid have been received. The exception there when >>> ceph_tcp_recvpage returns <= 0. >>> >>> ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also >>> in other cases). So it sounds like the client just timed out on a read >>> from the socket or caught a signal or something? >>> >>> If that's correct, then do we know what ceph_tcp_recvpage returned when >>> the problem happened? >> It should just return parital data, and we should continue from here in >> the next loop when the reset data comes. >> > Tracking this extra length seems like the wrong fix. We're already > looping in read_sparse_msg_extent until the sr_resid goes to 0. Yeah, it is and it works well. > ISTM > that it's just that read_sparse_msg_extent is returning inappropriately > in the face of timeouts. > > IOW, it does this: > > ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len); > if (ret <= 0) > return ret; > > ...should it just not be returning there when ret == 0? Maybe it should > be retrying the recvpage instead? Currently the the ceph_tcp_recvpage() will read data without blocking. If so we will change the logic here then all the other non-sparse-read cases will be changed to. Please note this won't fix anything here in this bug. Because currently the sparse-read code works well if in step 4 it partially read the sparse-read data or extents. But in case of partially reading 'footer' in step 5. What we need to guarantee is that in the second loop we could skip triggering a new sparse-read in step 4: 1, /* header */ ===> will skip and do nothing if it has already read the 'header' data in last loop 2, /* front */ ===> will skip and do nothing if it has already read the 'front' data in last loop 3, /* middle */ ===> will skip and do nothing if it has already read the 'middle' data in last loop 4, /* (page) data */ ===> sparse-read here, it also should skip and do nothing if it has already read the whole 'sparse-read' data in last loop, but it won't. This is the ROOT CAUSE of this bug. 5, /* footer */ ===> the 'read_partial()' will only read partial 'footer' data then need to loop start from /* header */ when the data comes My patch could guarantee that the sparse-read code will do nothing. While currently the code will trigger a new sparse-read from beginning again, which is incorrect. Jeff, please let me know if you have better approaches. The last one Ilya mentioned didn't work. Thanks - Xiubo
On Mon, 2024-01-22 at 10:52 +0800, Xiubo Li wrote: > On 1/19/24 19:09, Jeff Layton wrote: > > On Fri, 2024-01-19 at 12:35 +0800, Xiubo Li wrote: > > > On 1/19/24 02:24, Jeff Layton wrote: > > > > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > > > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > > > > > The messages from ceph maybe split into multiple socket packages > > > > > and we just need to wait for all the data to be availiable on the > > > > > sokcet. > > > > > > > > > > This will add 'sr_total_resid' to record the total length for all > > > > > data items for sparse-read message and 'sr_resid_elen' to record > > > > > the current extent total length. > > > > > > > > > > URL: https://tracker.ceph.com/issues/63586 > > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > > > --- > > > > > include/linux/ceph/messenger.h | 1 + > > > > > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > > > > > 2 files changed, 22 insertions(+), 11 deletions(-) > > > > > > > > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > > > > > index 2eaaabbe98cb..ca6f82abed62 100644 > > > > > --- a/include/linux/ceph/messenger.h > > > > > +++ b/include/linux/ceph/messenger.h > > > > > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > > > > > > > > > struct ceph_msg_data_cursor { > > > > > size_t total_resid; /* across all data items */ > > > > > + size_t sr_total_resid; /* across all data items for sparse-read */ > > > > > > > > > > struct ceph_msg_data *data; /* current data item */ > > > > > size_t resid; /* bytes not yet consumed */ > > > > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > > > > > index 4cb60bacf5f5..2733da891688 100644 > > > > > --- a/net/ceph/messenger_v1.c > > > > > +++ b/net/ceph/messenger_v1.c > > > > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > > > > > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > > > > > { > > > > > /* Initialize data cursor if it's not a sparse read */ > > > > > - if (!msg->sparse_read) > > > > > + if (msg->sparse_read) > > > > > + msg->cursor.sr_total_resid = data_len; > > > > > + else > > > > > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > > > > > } > > > > > > > > > > @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) > > > > > bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); > > > > > u32 crc = 0; > > > > > int ret = 1; > > > > > + int len; > > > > > > > > > > if (do_datacrc) > > > > > crc = con->in_data_crc; > > > > > > > > > > - do { > > > > > - if (con->v1.in_sr_kvec.iov_base) > > > > > + while (cursor->sr_total_resid) { > > > > > + len = 0; > > > > > + if (con->v1.in_sr_kvec.iov_base) { > > > > > + len = con->v1.in_sr_kvec.iov_len; > > > > > ret = read_partial_message_chunk(con, > > > > > &con->v1.in_sr_kvec, > > > > > con->v1.in_sr_len, > > > > > &crc); > > > > > - else if (cursor->sr_resid > 0) > > > > > + len = con->v1.in_sr_kvec.iov_len - len; > > > > > + } else if (cursor->sr_resid > 0) { > > > > > + len = cursor->sr_resid; > > > > > ret = read_partial_sparse_msg_extent(con, &crc); > > > > > - > > > > > - if (ret <= 0) { > > > > > - if (do_datacrc) > > > > > - con->in_data_crc = crc; > > > > > - return ret; > > > > > + len -= cursor->sr_resid; > > > > > } > > > > > + cursor->sr_total_resid -= len; > > > > > + if (ret <= 0) > > > > > + break; > > > > > > > > > > memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); > > > > > ret = con->ops->sparse_read(con, cursor, > > > > > (char **)&con->v1.in_sr_kvec.iov_base); > > > > > + if (ret <= 0) { > > > > > + ret = ret ? : 1; /* must return > 0 to indicate success */ > > > > > + break; > > > > > + } > > > > > con->v1.in_sr_len = ret; > > > > > - } while (ret > 0); > > > > > + } > > > > > > > > > > if (do_datacrc) > > > > > con->in_data_crc = crc; > > > > > > > > > > - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ > > > > > + return ret; > > > > > } > > > > > > > > > > static int read_partial_msg_data(struct ceph_connection *con) > > > > Looking back over this code... > > > > > > > > The way it works today, once we determine it's a sparse read, we call > > > > read_sparse_msg_data. At that point we call either > > > > read_partial_message_chunk (to read into the kvec) or > > > > read_sparse_msg_extent if sr_resid is already set (indicating that we're > > > > receiving an extent). > > > > > > > > read_sparse_msg_extent calls ceph_tcp_recvpage in a loop until > > > > cursor->sr_resid have been received. The exception there when > > > > ceph_tcp_recvpage returns <= 0. > > > > > > > > ceph_tcp_recvpage returns 0 if sock_recvmsg returns -EAGAIN (maybe also > > > > in other cases). So it sounds like the client just timed out on a read > > > > from the socket or caught a signal or something? > > > > > > > > If that's correct, then do we know what ceph_tcp_recvpage returned when > > > > the problem happened? > > > It should just return parital data, and we should continue from here in > > > the next loop when the reset data comes. > > > > > Tracking this extra length seems like the wrong fix. We're already > > looping in read_sparse_msg_extent until the sr_resid goes to 0. > Yeah, it is and it works well. > > ISTM > > that it's just that read_sparse_msg_extent is returning inappropriately > > in the face of timeouts. > > > > IOW, it does this: > > > > ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len); > > if (ret <= 0) > > return ret; > > > > ...should it just not be returning there when ret == 0? Maybe it should > > be retrying the recvpage instead? > > Currently the the ceph_tcp_recvpage() will read data without blocking. Yes. > If so we will change the logic here then all the other non-sparse-read > cases will be changed to. > No. read_sparse_msg_data is only called from the sparse-read codepath. If we change it, only that will be affected. > Please note this won't fix anything here in this bug. > > Because currently the sparse-read code works well if in step 4 it > partially read the sparse-read data or extents. > > But in case of partially reading 'footer' in step 5. What we need to > guarantee is that in the second loop we could skip triggering a new > sparse-read in step 4: > > 1, /* header */ ===> will skip and do nothing if it has already > read the 'header' data in last loop > > 2, /* front */ ===> will skip and do nothing if it has > already read the 'front' data in last loop > > 3, /* middle */ ===> will skip and do nothing if it has already > read the 'middle' data in last loop > > 4, /* (page) data */ ===> sparse-read here, it also should skip and do > nothing if it has already read the whole 'sparse-read' data in last > loop, but it won't. This is the ROOT CAUSE of this bug. > > 5, /* footer */ ===> the 'read_partial()' will only read > partial 'footer' data then need to loop start from /* header */ when the > data comes > > My patch could guarantee that the sparse-read code will do nothing. > While currently the code will trigger a new sparse-read from beginning > again, which is incorrect. > > Jeff, please let me know if you have better approaches. The last one > Ilya mentioned didn't work. Your patch tries to track sr_resid independently in a new variable sr_total_resid, but I think that's unnecessary. read_sparse_msg_data returns under two conditions: 1. It has read all of the sparse read data (i.e. sr_resid is 0), in which case it returns 1. ...or... 2. ceph_tcp_recvpage returns a negative error code or 0. After your patch, the only way you'd get a case where sr_total_resid is >0 is if case #2 happens. Clearly if we receive all of the data then sr_total_resid will also be 0. We want to return an error if there is a hard error from ceph_tcp_recvpage, but it looks like it also returns 0 if the socket read returns -EAGAIN. So, it seems to be that doing something like this would be a sufficient fix. What am I missing? diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index f9a50d7f0d20..cf94ebdb3b34 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -1015,8 +1015,10 @@ static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc) /* clamp to what remains in extent */ len = min_t(int, len, cursor->sr_resid); ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len); - if (ret <= 0) + if (ret < 0) return ret; + else if (ret == 0) + continue; *crc = ceph_crc32c_page(*crc, rpage, off, ret); ceph_msg_data_advance(cursor, (size_t)ret); cursor->sr_resid -= ret;
On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > The messages from ceph maybe split into multiple socket packages > and we just need to wait for all the data to be availiable on the > sokcet. > > This will add 'sr_total_resid' to record the total length for all > data items for sparse-read message and 'sr_resid_elen' to record > the current extent total length. > > URL: https://tracker.ceph.com/issues/63586 > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > include/linux/ceph/messenger.h | 1 + > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > 2 files changed, 22 insertions(+), 11 deletions(-) > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > index 2eaaabbe98cb..ca6f82abed62 100644 > --- a/include/linux/ceph/messenger.h > +++ b/include/linux/ceph/messenger.h > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > struct ceph_msg_data_cursor { > size_t total_resid; /* across all data items */ > + size_t sr_total_resid; /* across all data items for sparse-read */ > > struct ceph_msg_data *data; /* current data item */ > size_t resid; /* bytes not yet consumed */ > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > index 4cb60bacf5f5..2733da891688 100644 > --- a/net/ceph/messenger_v1.c > +++ b/net/ceph/messenger_v1.c > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > { > /* Initialize data cursor if it's not a sparse read */ > - if (!msg->sparse_read) > + if (msg->sparse_read) > + msg->cursor.sr_total_resid = data_len; > + else > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > } > > Sorry, disregard my last email. I went and looked at the patch in the tree, and I better understand what you're trying to do. We already have a value that gets set to data_len called total_resid, but that is a nonsense value in a sparse read, because we can advance farther through the receive buffer than the amount of data in the socket. So, I think you do need a separate value like this that's only used in sparse reads, or you'll have to change how ->total_resid is handled during a sparse read. The first way is simpler, so I agree with the approach. That said, if you're going to add a new value to ceph_msg_data_cursor, then you really ought to initialize it in ceph_msg_data_cursor_init. You don't necessarily have to use it elsewhere, but that would be cleaner than the if statement above. More comments below. > > @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) > bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); > u32 crc = 0; > int ret = 1; > + int len; > > if (do_datacrc) > crc = con->in_data_crc; > > - do { > - if (con->v1.in_sr_kvec.iov_base) > + while (cursor->sr_total_resid) { > + len = 0; > + if (con->v1.in_sr_kvec.iov_base) { > + len = con->v1.in_sr_kvec.iov_len; > ret = read_partial_message_chunk(con, > &con->v1.in_sr_kvec, > con->v1.in_sr_len, > &crc); > - else if (cursor->sr_resid > 0) > + len = con->v1.in_sr_kvec.iov_len - len; len is going to be 0 after the above statement. I'm not sure I understand the point of messing with its value in the above block. > + } else if (cursor->sr_resid > 0) { > + len = cursor->sr_resid; > ret = read_partial_sparse_msg_extent(con, &crc); > - > - if (ret <= 0) { > - if (do_datacrc) > - con->in_data_crc = crc; > - return ret; > + len -= cursor->sr_resid; Won't "len" also be 0 after the above? > } > + cursor->sr_total_resid -= len; ...and so sr_total_resid always decrements by 0? That doesn't look like it's doing the right thing. > + if (ret <= 0) > + break; > > memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); > ret = con->ops->sparse_read(con, cursor, > (char **)&con->v1.in_sr_kvec.iov_base); > + if (ret <= 0) { > + ret = ret ? : 1; /* must return > 0 to indicate success */ > + break; > + } > con->v1.in_sr_len = ret; > - } while (ret > 0); > + } > > if (do_datacrc) > con->in_data_crc = crc; > > - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ > + return ret; > } > > static int read_partial_msg_data(struct ceph_connection *con)
On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > > From: Xiubo Li <xiubli@redhat.com> > > > > The messages from ceph maybe split into multiple socket packages > > and we just need to wait for all the data to be availiable on the > > sokcet. > > > > This will add 'sr_total_resid' to record the total length for all > > data items for sparse-read message and 'sr_resid_elen' to record > > the current extent total length. > > > > URL: https://tracker.ceph.com/issues/63586 > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > --- > > include/linux/ceph/messenger.h | 1 + > > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > > 2 files changed, 22 insertions(+), 11 deletions(-) > > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > > index 2eaaabbe98cb..ca6f82abed62 100644 > > --- a/include/linux/ceph/messenger.h > > +++ b/include/linux/ceph/messenger.h > > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > > > struct ceph_msg_data_cursor { > > size_t total_resid; /* across all data items */ > > + size_t sr_total_resid; /* across all data items for sparse-read */ > > > > struct ceph_msg_data *data; /* current data item */ > > size_t resid; /* bytes not yet consumed */ > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > > index 4cb60bacf5f5..2733da891688 100644 > > --- a/net/ceph/messenger_v1.c > > +++ b/net/ceph/messenger_v1.c > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > > { > > /* Initialize data cursor if it's not a sparse read */ > > - if (!msg->sparse_read) > > + if (msg->sparse_read) > > + msg->cursor.sr_total_resid = data_len; > > + else > > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > > } > > > > > > Sorry, disregard my last email. > > I went and looked at the patch in the tree, and I better understand what > you're trying to do. We already have a value that gets set to data_len > called total_resid, but that is a nonsense value in a sparse read, > because we can advance farther through the receive buffer than the > amount of data in the socket. Hi Jeff, I see that total_resid is set to sparse_data_requested(), which is effectively the size of the receive buffer, not data_len. (This is ignoring the seemingly unnecessary complication of trying to support normal reads mixed with sparse reads in the same message, which I'm pretty sure doesn't work anyway.) With that, total_resid should represent the amount that needs to be filled in (advanced through) in the receive buffer. When total_resid reaches 0, wouldn't that mean that the amount of data in the socket is also 0? If not, where would the remaining data in the socket go? Thanks, Ilya
On Mon, 2024-01-22 at 17:55 +0100, Ilya Dryomov wrote: > On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > The messages from ceph maybe split into multiple socket packages > > > and we just need to wait for all the data to be availiable on the > > > sokcet. > > > > > > This will add 'sr_total_resid' to record the total length for all > > > data items for sparse-read message and 'sr_resid_elen' to record > > > the current extent total length. > > > > > > URL: https://tracker.ceph.com/issues/63586 > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > --- > > > include/linux/ceph/messenger.h | 1 + > > > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > > > 2 files changed, 22 insertions(+), 11 deletions(-) > > > > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > > > index 2eaaabbe98cb..ca6f82abed62 100644 > > > --- a/include/linux/ceph/messenger.h > > > +++ b/include/linux/ceph/messenger.h > > > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > > > > > struct ceph_msg_data_cursor { > > > size_t total_resid; /* across all data items */ > > > + size_t sr_total_resid; /* across all data items for sparse-read */ > > > > > > struct ceph_msg_data *data; /* current data item */ > > > size_t resid; /* bytes not yet consumed */ > > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > > > index 4cb60bacf5f5..2733da891688 100644 > > > --- a/net/ceph/messenger_v1.c > > > +++ b/net/ceph/messenger_v1.c > > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > > > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > > > { > > > /* Initialize data cursor if it's not a sparse read */ > > > - if (!msg->sparse_read) > > > + if (msg->sparse_read) > > > + msg->cursor.sr_total_resid = data_len; > > > + else > > > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > > > } > > > > > > > > > > Sorry, disregard my last email. > > > > I went and looked at the patch in the tree, and I better understand what > > you're trying to do. We already have a value that gets set to data_len > > called total_resid, but that is a nonsense value in a sparse read, > > because we can advance farther through the receive buffer than the > > amount of data in the socket. > > Hi Jeff, > > I see that total_resid is set to sparse_data_requested(), which is > effectively the size of the receive buffer, not data_len. (This is > ignoring the seemingly unnecessary complication of trying to support > normal reads mixed with sparse reads in the same message, which I'm > pretty sure doesn't work anyway.) > Oh right. I missed that bit when I was re-reviewing this. Once we're in a sparse read we'll override that value. Ok, so maybe we don't need sr_total_resid. > With that, total_resid should represent the amount that needs to be > filled in (advanced through) in the receive buffer. When total_resid > reaches 0, wouldn't that mean that the amount of data in the socket is > also 0? If not, where would the remaining data in the socket go? > With a properly formed reply, then I think yes, there should be no remaining data in the socket at the end of the receive. At this point I think I must just be confused about the actual problem. I think I need a detailed description of it before I can properly review this patch.
On Mon, Jan 22, 2024 at 6:14 PM Jeff Layton <jlayton@kernel.org> wrote: > > On Mon, 2024-01-22 at 17:55 +0100, Ilya Dryomov wrote: > > On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote: > > > > > > On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: > > > > From: Xiubo Li <xiubli@redhat.com> > > > > > > > > The messages from ceph maybe split into multiple socket packages > > > > and we just need to wait for all the data to be availiable on the > > > > sokcet. > > > > > > > > This will add 'sr_total_resid' to record the total length for all > > > > data items for sparse-read message and 'sr_resid_elen' to record > > > > the current extent total length. > > > > > > > > URL: https://tracker.ceph.com/issues/63586 > > > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > > > > --- > > > > include/linux/ceph/messenger.h | 1 + > > > > net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- > > > > 2 files changed, 22 insertions(+), 11 deletions(-) > > > > > > > > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h > > > > index 2eaaabbe98cb..ca6f82abed62 100644 > > > > --- a/include/linux/ceph/messenger.h > > > > +++ b/include/linux/ceph/messenger.h > > > > @@ -231,6 +231,7 @@ struct ceph_msg_data { > > > > > > > > struct ceph_msg_data_cursor { > > > > size_t total_resid; /* across all data items */ > > > > + size_t sr_total_resid; /* across all data items for sparse-read */ > > > > > > > > struct ceph_msg_data *data; /* current data item */ > > > > size_t resid; /* bytes not yet consumed */ > > > > diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c > > > > index 4cb60bacf5f5..2733da891688 100644 > > > > --- a/net/ceph/messenger_v1.c > > > > +++ b/net/ceph/messenger_v1.c > > > > @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) > > > > static void prepare_message_data(struct ceph_msg *msg, u32 data_len) > > > > { > > > > /* Initialize data cursor if it's not a sparse read */ > > > > - if (!msg->sparse_read) > > > > + if (msg->sparse_read) > > > > + msg->cursor.sr_total_resid = data_len; > > > > + else > > > > ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); > > > > } > > > > > > > > > > > > > > Sorry, disregard my last email. > > > > > > I went and looked at the patch in the tree, and I better understand what > > > you're trying to do. We already have a value that gets set to data_len > > > called total_resid, but that is a nonsense value in a sparse read, > > > because we can advance farther through the receive buffer than the > > > amount of data in the socket. > > > > Hi Jeff, > > > > I see that total_resid is set to sparse_data_requested(), which is > > effectively the size of the receive buffer, not data_len. (This is > > ignoring the seemingly unnecessary complication of trying to support > > normal reads mixed with sparse reads in the same message, which I'm > > pretty sure doesn't work anyway.) > > > > Oh right. I missed that bit when I was re-reviewing this. Once we're in > a sparse read we'll override that value. Ok, so maybe we don't need > sr_total_resid. > > > With that, total_resid should represent the amount that needs to be > > filled in (advanced through) in the receive buffer. When total_resid > > reaches 0, wouldn't that mean that the amount of data in the socket is > > also 0? If not, where would the remaining data in the socket go? > > > > With a properly formed reply, then I think yes, there should be no > remaining data in the socket at the end of the receive. There would be no actual data, but a message footer which follows the data section and ends the message would be outstanding. > > At this point I think I must just be confused about the actual problem. > I think I need a detailed description of it before I can properly review > this patch. AFAIU the problem is that a short read may occur while reading the message footer from the socket. Later, when the socket is ready for another read, the messenger invokes all read_partial* handlers, including read_partial_sparse_msg_data(). The contract between the messenger and these handlers is that the handler should bail if the area of the message it's responsible for is already processed. So, in this case, it's expected that read_sparse_msg_data() would bail, allowing the messenger to invoke read_partial() for the footer and pick up where it left off. However read_sparse_msg_data() violates that contract and ends up calling into the state machine in the OSD client. The state machine assumes that it's a new op and interprets some piece of the footer (or even completely random memory?) as the sparse-read header and returns bogus extent length, etc. (BTW it's why I suggested the rename from read_sparse_msg_data() to read_partial_sparse_msg_data() in another patch -- to highlight that it's one of the "partial" handlers and the respective behavior.) Thanks, Ilya
On 1/23/24 03:41, Ilya Dryomov wrote: > On Mon, Jan 22, 2024 at 6:14 PM Jeff Layton <jlayton@kernel.org> wrote: >> On Mon, 2024-01-22 at 17:55 +0100, Ilya Dryomov wrote: >>> On Mon, Jan 22, 2024 at 4:02 PM Jeff Layton <jlayton@kernel.org> wrote: >>>> On Thu, 2024-01-18 at 18:50 +0800, xiubli@redhat.com wrote: >>>>> From: Xiubo Li <xiubli@redhat.com> >>>>> >>>>> The messages from ceph maybe split into multiple socket packages >>>>> and we just need to wait for all the data to be availiable on the >>>>> sokcet. >>>>> >>>>> This will add 'sr_total_resid' to record the total length for all >>>>> data items for sparse-read message and 'sr_resid_elen' to record >>>>> the current extent total length. >>>>> >>>>> URL: https://tracker.ceph.com/issues/63586 >>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com> >>>>> --- >>>>> include/linux/ceph/messenger.h | 1 + >>>>> net/ceph/messenger_v1.c | 32 +++++++++++++++++++++----------- >>>>> 2 files changed, 22 insertions(+), 11 deletions(-) >>>>> >>>>> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h >>>>> index 2eaaabbe98cb..ca6f82abed62 100644 >>>>> --- a/include/linux/ceph/messenger.h >>>>> +++ b/include/linux/ceph/messenger.h >>>>> @@ -231,6 +231,7 @@ struct ceph_msg_data { >>>>> >>>>> struct ceph_msg_data_cursor { >>>>> size_t total_resid; /* across all data items */ >>>>> + size_t sr_total_resid; /* across all data items for sparse-read */ >>>>> >>>>> struct ceph_msg_data *data; /* current data item */ >>>>> size_t resid; /* bytes not yet consumed */ >>>>> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c >>>>> index 4cb60bacf5f5..2733da891688 100644 >>>>> --- a/net/ceph/messenger_v1.c >>>>> +++ b/net/ceph/messenger_v1.c >>>>> @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) >>>>> static void prepare_message_data(struct ceph_msg *msg, u32 data_len) >>>>> { >>>>> /* Initialize data cursor if it's not a sparse read */ >>>>> - if (!msg->sparse_read) >>>>> + if (msg->sparse_read) >>>>> + msg->cursor.sr_total_resid = data_len; >>>>> + else >>>>> ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); >>>>> } >>>>> >>>>> >>>> Sorry, disregard my last email. >>>> >>>> I went and looked at the patch in the tree, and I better understand what >>>> you're trying to do. We already have a value that gets set to data_len >>>> called total_resid, but that is a nonsense value in a sparse read, >>>> because we can advance farther through the receive buffer than the >>>> amount of data in the socket. >>> Hi Jeff, >>> >>> I see that total_resid is set to sparse_data_requested(), which is >>> effectively the size of the receive buffer, not data_len. (This is >>> ignoring the seemingly unnecessary complication of trying to support >>> normal reads mixed with sparse reads in the same message, which I'm >>> pretty sure doesn't work anyway.) >>> >> Oh right. I missed that bit when I was re-reviewing this. Once we're in >> a sparse read we'll override that value. Ok, so maybe we don't need >> sr_total_resid. >> Oh, I get you now. Yeah we can just reuse the 'total_resid' instead of adding a new one. >>> With that, total_resid should represent the amount that needs to be >>> filled in (advanced through) in the receive buffer. When total_resid >>> reaches 0, wouldn't that mean that the amount of data in the socket is >>> also 0? If not, where would the remaining data in the socket go? >>> >> With a properly formed reply, then I think yes, there should be no >> remaining data in the socket at the end of the receive. > There would be no actual data, but a message footer which follows the > data section and ends the message would be outstanding. Yeah, correct. >> At this point I think I must just be confused about the actual problem. >> I think I need a detailed description of it before I can properly review >> this patch. > AFAIU the problem is that a short read may occur while reading the > message footer from the socket. Later, when the socket is ready for > another read, the messenger invokes all read_partial* handlers, > including read_partial_sparse_msg_data(). The contract between the > messenger and these handlers is that the handler should bail if the > area of the message it's responsible for is already processed. So, > in this case, it's expected that read_sparse_msg_data() would bail, > allowing the messenger to invoke read_partial() for the footer and > pick up where it left off. > > However read_sparse_msg_data() violates that contract and ends up > calling into the state machine in the OSD client. The state machine > assumes that it's a new op and interprets some piece of the footer (or > even completely random memory?) as the sparse-read header and returns > bogus extent length, etc. > > (BTW it's why I suggested the rename from read_sparse_msg_data() to > read_partial_sparse_msg_data() in another patch -- to highlight that > it's one of the "partial" handlers and the respective behavior.) Yeah, Ilya is correct. Thanks - Xiubo > Thanks, > > Ilya >
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 2eaaabbe98cb..ca6f82abed62 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -231,6 +231,7 @@ struct ceph_msg_data { struct ceph_msg_data_cursor { size_t total_resid; /* across all data items */ + size_t sr_total_resid; /* across all data items for sparse-read */ struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 4cb60bacf5f5..2733da891688 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -160,7 +160,9 @@ static size_t sizeof_footer(struct ceph_connection *con) static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { /* Initialize data cursor if it's not a sparse read */ - if (!msg->sparse_read) + if (msg->sparse_read) + msg->cursor.sr_total_resid = data_len; + else ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); } @@ -1032,35 +1034,43 @@ static int read_partial_sparse_msg_data(struct ceph_connection *con) bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc = 0; int ret = 1; + int len; if (do_datacrc) crc = con->in_data_crc; - do { - if (con->v1.in_sr_kvec.iov_base) + while (cursor->sr_total_resid) { + len = 0; + if (con->v1.in_sr_kvec.iov_base) { + len = con->v1.in_sr_kvec.iov_len; ret = read_partial_message_chunk(con, &con->v1.in_sr_kvec, con->v1.in_sr_len, &crc); - else if (cursor->sr_resid > 0) + len = con->v1.in_sr_kvec.iov_len - len; + } else if (cursor->sr_resid > 0) { + len = cursor->sr_resid; ret = read_partial_sparse_msg_extent(con, &crc); - - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - return ret; + len -= cursor->sr_resid; } + cursor->sr_total_resid -= len; + if (ret <= 0) + break; memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); ret = con->ops->sparse_read(con, cursor, (char **)&con->v1.in_sr_kvec.iov_base); + if (ret <= 0) { + ret = ret ? : 1; /* must return > 0 to indicate success */ + break; + } con->v1.in_sr_len = ret; - } while (ret > 0); + } if (do_datacrc) con->in_data_crc = crc; - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ + return ret; } static int read_partial_msg_data(struct ceph_connection *con)