diff mbox series

ceph: make num_fwd and num_retry to __u32

Message ID 20230725104438.386461-1-xiubli@redhat.com (mailing list archive)
State New, archived
Headers show
Series ceph: make num_fwd and num_retry to __u32 | expand

Commit Message

Xiubo Li July 25, 2023, 10:44 a.m. UTC
From: Xiubo Li <xiubli@redhat.com>

The num_fwd in MClientRequestForward is int32_t, while the num_fwd
in ceph_mds_request_head is __u8. This is buggy: when the num_fwd
is larger than 255 it will always be truncated to 0 again, but the
client can't recognize this.

This will make them __u32 instead. Because old cephs will
directly copy the raw memory when decoding the request's head,
we need to make sure this kclient will be compatible with old
cephs. Newer cephs will decode the requests depending on the
version, which will be much simpler and easier to extend with
new members.

URL: https://tracker.ceph.com/issues/62145
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c         | 191 ++++++++++++++++++-----------------
 fs/ceph/mds_client.h         |   4 +-
 include/linux/ceph/ceph_fs.h |  23 ++++-
 3 files changed, 124 insertions(+), 94 deletions(-)

Comments

Alexander Mikhalitsyn July 26, 2023, 10:06 a.m. UTC | #1
Hi Xiubo!

On Tue, Jul 25, 2023 at 12:46 PM <xiubli@redhat.com> wrote:
>
> From: Xiubo Li <xiubli@redhat.com>
>
> The num_fwd in MClientRequestForward is int32_t, while the num_fwd
> in ceph_mds_request_head is __u8. This is buggy when the num_fwd
> is larger than 256 it will always be truncate to 0 again. But the
> client couldn't recoginize this.
>
> This will make them to __u32 instead. Because the old cephs will
> directly copy the raw memories when decoding the reqeust's head,
> so we need to make sure this kclient will be compatible with old
> cephs. For newer cephs they will decode the requests depending
> the version, which will be much simpler and easier to extend new
> members.
>
> URL: https://tracker.ceph.com/issues/62145
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/mds_client.c         | 191 ++++++++++++++++++-----------------
>  fs/ceph/mds_client.h         |   4 +-
>  include/linux/ceph/ceph_fs.h |  23 ++++-
>  3 files changed, 124 insertions(+), 94 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 70987b3c198a..191bae3a4ee6 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
>         }
>  }
>
> +static struct ceph_mds_request_head_legacy *
> +find_legacy_request_head(void *p, u64 features)
> +{
> +       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> +       struct ceph_mds_request_head_old *ohead;
> +
> +       if (legacy)
> +               return (struct ceph_mds_request_head_legacy *)p;
> +       ohead = (struct ceph_mds_request_head_old *)p;
> +       return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
> +}
> +
>  /*
>   * called under mdsc->mutex
>   */
> @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>         int mds = session->s_mds;
>         struct ceph_mds_client *mdsc = session->s_mdsc;
>         struct ceph_msg *msg;
> -       struct ceph_mds_request_head_old *head;
> +       struct ceph_mds_request_head_legacy *lhead;
>         const char *path1 = NULL;
>         const char *path2 = NULL;
>         u64 ino1 = 0, ino2 = 0;
> @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>         void *p, *end;
>         int ret;
>         bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>
>         ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
>                               req->r_parent, req->r_path1, req->r_ino1.ino,
> @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>                 goto out_free2;
>         }
>
> -       len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
> +       /*
> +        * For old cephs without supporting the 32bit retry/fwd feature
> +        * it will copy the raw memories directly when decoding the
> +        * requests. While new cephs will decode the head depending the
> +        * version member, so we need to make sure it will be compatible
> +        * with them both.
> +        */
> +       if (legacy)
> +               len = sizeof(struct ceph_mds_request_head_legacy);
> +       else if (old_version)
> +               len = sizeof(struct ceph_mds_request_head_old);
> +       else
> +               len = sizeof(struct ceph_mds_request_head);
>
>         /* filepaths */
>         len += 2 * (1 + sizeof(u32) + sizeof(u64));
> @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>
>         msg->hdr.tid = cpu_to_le64(req->r_tid);
>
> +       lhead = find_legacy_request_head(msg->front.iov_base,
> +                                        session->s_con.peer_features);
> +
>         /*
> -        * The old ceph_mds_request_head didn't contain a version field, and
> +        * The ceph_mds_request_head_legacy didn't contain a version field, and
>          * one was added when we moved the message version from 3->4.
>          */
>         if (legacy) {
>                 msg->hdr.version = cpu_to_le16(3);
> -               head = msg->front.iov_base;
> -               p = msg->front.iov_base + sizeof(*head);
> +               p = msg->front.iov_base + sizeof(*lhead);
> +       } else if (old_version) {
> +               struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> +
> +               msg->hdr.version = cpu_to_le16(4);
> +               ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);

Don't we want to use the old mds request head version here:
ohead->version = cpu_to_le16(1);
?

As far as I understand, the idea here is to skip the new fields
ext_num_retry/ext_num_fwd in the case
when old_version is true. Would it be incorrect to set the version to
the latest one (CEPH_MDS_REQUEST_HEAD_VERSION)
and at the same time skip setting the new fields?

Kind regards,
Alex

> +               p = msg->front.iov_base + sizeof(*ohead);
>         } else {
> -               struct ceph_mds_request_head *new_head = msg->front.iov_base;
> +               struct ceph_mds_request_head *nhead = msg->front.iov_base;
>
>                 msg->hdr.version = cpu_to_le16(6);
> -               new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> -               head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> -               p = msg->front.iov_base + sizeof(*new_head);
> +               nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> +               p = msg->front.iov_base + sizeof(*nhead);
>         }
>
>         end = msg->front.iov_base + msg->front.iov_len;
>
> -       head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> -       head->op = cpu_to_le32(req->r_op);
> -       head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> -                                                req->r_cred->fsuid));
> -       head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> -                                                req->r_cred->fsgid));
> -       head->ino = cpu_to_le64(req->r_deleg_ino);
> -       head->args = req->r_args;
> +       lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> +       lhead->op = cpu_to_le32(req->r_op);
> +       lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> +                                                 req->r_cred->fsuid));
> +       lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> +                                                 req->r_cred->fsgid));
> +       lhead->ino = cpu_to_le64(req->r_deleg_ino);
> +       lhead->args = req->r_args;
>
>         ceph_encode_filepath(&p, end, ino1, path1);
>         ceph_encode_filepath(&p, end, ino2, path2);
> @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>                 p = msg->front.iov_base + req->r_request_release_offset;
>         }
>
> -       head->num_releases = cpu_to_le16(releases);
> +       lhead->num_releases = cpu_to_le16(releases);
>
>         encode_mclientrequest_tail(&p, req);
>
> @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
>         complete_all(&req->r_completion);
>  }
>
> -static struct ceph_mds_request_head_old *
> -find_old_request_head(void *p, u64 features)
> -{
> -       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> -       struct ceph_mds_request_head *new_head;
> -
> -       if (legacy)
> -               return (struct ceph_mds_request_head_old *)p;
> -       new_head = (struct ceph_mds_request_head *)p;
> -       return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> -}
> -
>  /*
>   * called under mdsc->mutex
>   */
> @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>         int mds = session->s_mds;
>         struct ceph_mds_client *mdsc = session->s_mdsc;
>         struct ceph_client *cl = mdsc->fsc->client;
> -       struct ceph_mds_request_head_old *rhead;
> +       struct ceph_mds_request_head_legacy *lhead;
> +       struct ceph_mds_request_head *nhead;
>         struct ceph_msg *msg;
> -       int flags = 0, max_retry;
> +       int flags = 0, old_max_retry;
> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>
> -       /*
> -        * The type of 'r_attempts' in kernel 'ceph_mds_request'
> -        * is 'int', while in 'ceph_mds_request_head' the type of
> -        * 'num_retry' is '__u8'. So in case the request retries
> -        *  exceeding 256 times, the MDS will receive a incorrect
> -        *  retry seq.
> -        *
> -        * In this case it's ususally a bug in MDS and continue
> -        * retrying the request makes no sense.
> -        *
> -        * In future this could be fixed in ceph code, so avoid
> -        * using the hardcode here.
> +       /* Avoid inifinite retrying after overflow. The client will
> +        * increase the retry count and if the MDS is old version,
> +        * so we limit to retry at most 256 times.
>          */
> -       max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
> -       max_retry = 1 << (max_retry * BITS_PER_BYTE);
> -       if (req->r_attempts >= max_retry) {
> -               pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
> -                                          req->r_tid);
> -               return -EMULTIHOP;
> -       }
> +       if (req->r_attempts) {
> +               old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry);
> +               old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
> +               if ((old_version && req->r_attempts >= old_max_retry) ||
> +                   ((uint32_t)req->r_attempts >= U32_MAX)) {
> +                       pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n",
> +                                                 __func__, req->r_tid);
> +                       return -EMULTIHOP;
> +               }
> +        }
>
>         req->r_attempts++;
>         if (req->r_inode) {
> @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>                  * d_move mangles the src name.
>                  */
>                 msg = req->r_request;
> -               rhead = find_old_request_head(msg->front.iov_base,
> -                                             session->s_con.peer_features);
> +               lhead = find_legacy_request_head(msg->front.iov_base,
> +                                                session->s_con.peer_features);
>
> -               flags = le32_to_cpu(rhead->flags);
> +               flags = le32_to_cpu(lhead->flags);
>                 flags |= CEPH_MDS_FLAG_REPLAY;
> -               rhead->flags = cpu_to_le32(flags);
> +               lhead->flags = cpu_to_le32(flags);
>
>                 if (req->r_target_inode)
> -                       rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
> +                       lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
>
> -               rhead->num_retry = req->r_attempts - 1;
> +               lhead->num_retry = req->r_attempts - 1;
> +               if (!old_version) {
> +                       nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> +                       nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> +               }
>
>                 /* remove cap/dentry releases from message */
> -               rhead->num_releases = 0;
> +               lhead->num_releases = 0;
>
>                 p = msg->front.iov_base + req->r_request_release_offset;
>                 encode_mclientrequest_tail(&p, req);
> @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>         }
>         req->r_request = msg;
>
> -       rhead = find_old_request_head(msg->front.iov_base,
> -                                     session->s_con.peer_features);
> -       rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
> +       lhead = find_legacy_request_head(msg->front.iov_base,
> +                                        session->s_con.peer_features);
> +       lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
>         if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
>                 flags |= CEPH_MDS_FLAG_REPLAY;
>         if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
>                 flags |= CEPH_MDS_FLAG_ASYNC;
>         if (req->r_parent)
>                 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
> -       rhead->flags = cpu_to_le32(flags);
> -       rhead->num_fwd = req->r_num_fwd;
> -       rhead->num_retry = req->r_attempts - 1;
> +       lhead->flags = cpu_to_le32(flags);
> +       lhead->num_fwd = req->r_num_fwd;
> +       lhead->num_retry = req->r_attempts - 1;
> +       if (!old_version) {
> +               nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> +               nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
> +               nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> +       }
>
>         doutc(cl, " r_parent = %p\n", req->r_parent);
>         return 0;
> @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc,
>         if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
>                 doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
>                 __unregister_request(mdsc, req);
> -       } else if (fwd_seq <= req->r_num_fwd) {
> -               /*
> -                * The type of 'num_fwd' in ceph 'MClientRequestForward'
> -                * is 'int32_t', while in 'ceph_mds_request_head' the
> -                * type is '__u8'. So in case the request bounces between
> -                * MDSes exceeding 256 times, the client will get stuck.
> -                *
> -                * In this case it's ususally a bug in MDS and continue
> -                * bouncing the request makes no sense.
> +       } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
> +               /* Avoid inifinite retrying after overflow.
>                  *
> -                * In future this could be fixed in ceph code, so avoid
> -                * using the hardcode here.
> +                * The MDS will increase the fwd count and in client side
> +                * if the num_fwd is less than the one saved in request
> +                * that means the MDS is an old version and overflowed of
> +                * 8 bits.
>                  */
> -               int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
> -               max = 1 << (max * BITS_PER_BYTE);
> -               if (req->r_num_fwd >= max) {
> -                       mutex_lock(&req->r_fill_mutex);
> -                       req->r_err = -EMULTIHOP;
> -                       set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> -                       mutex_unlock(&req->r_fill_mutex);
> -                       aborted = true;
> -                       pr_warn_ratelimited_client(cl,
> -                                       "forward tid %llu seq overflow\n",
> -                                       tid);
> -               } else {
> -                       doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n",
> -                             tid, next_mds, req->r_num_fwd, fwd_seq);
> -               }
> +               mutex_lock(&req->r_fill_mutex);
> +               req->r_err = -EMULTIHOP;
> +               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> +               mutex_unlock(&req->r_fill_mutex);
> +               aborted = true;
> +               pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid);
>         } else {
>                 /* resend. forward race not possible; mds would drop */
>                 doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index befbd384428e..717a7399bacb 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -32,8 +32,9 @@ enum ceph_feature_type {
>         CEPHFS_FEATURE_ALTERNATE_NAME,
>         CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
>         CEPHFS_FEATURE_OP_GETVXATTR,
> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,
>
> -       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
> +       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
>  };
>
>  #define CEPHFS_FEATURES_CLIENT_SUPPORTED {     \
> @@ -47,6 +48,7 @@ enum ceph_feature_type {
>         CEPHFS_FEATURE_ALTERNATE_NAME,          \
>         CEPHFS_FEATURE_NOTIFY_SESSION_STATE,    \
>         CEPHFS_FEATURE_OP_GETVXATTR,            \
> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,        \
>  }
>
>  /*
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index 45f8ce61e103..f3b3593254b9 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext {
>  #define CEPH_MDS_FLAG_WANT_DENTRY      2 /* want dentry in reply */
>  #define CEPH_MDS_FLAG_ASYNC            4 /* request is asynchronous */
>
> -struct ceph_mds_request_head_old {
> +struct ceph_mds_request_head_legacy {
>         __le64 oldest_client_tid;
>         __le32 mdsmap_epoch;           /* on client */
>         __le32 flags;                  /* CEPH_MDS_FLAG_* */
> @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old {
>         union ceph_mds_request_args args;
>  } __attribute__ ((packed));
>
> -#define CEPH_MDS_REQUEST_HEAD_VERSION  1
> +#define CEPH_MDS_REQUEST_HEAD_VERSION  2
>
> -struct ceph_mds_request_head {
> +struct ceph_mds_request_head_old {
>         __le16 version;                /* struct version */
>         __le64 oldest_client_tid;
>         __le32 mdsmap_epoch;           /* on client */
> @@ -513,6 +513,23 @@ struct ceph_mds_request_head {
>         union ceph_mds_request_args_ext args;
>  } __attribute__ ((packed));
>
> +struct ceph_mds_request_head {
> +       __le16 version;                /* struct version */
> +       __le64 oldest_client_tid;
> +       __le32 mdsmap_epoch;           /* on client */
> +       __le32 flags;                  /* CEPH_MDS_FLAG_* */
> +       __u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
> +       __le16 num_releases;           /* # include cap/lease release records */
> +       __le32 op;                     /* mds op code */
> +       __le32 caller_uid, caller_gid;
> +       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
> +                                         etc. (if replaying) */
> +       union ceph_mds_request_args_ext args;
> +
> +       __le32 ext_num_retry;          /* new count retry attempts */
> +       __le32 ext_num_fwd;            /* new count fwd attempts */
> +} __attribute__ ((packed));
> +
>  /* cap/lease release record */
>  struct ceph_mds_request_release {
>         __le64 ino, cap_id;            /* ino and unique cap id */
> --
> 2.40.1
>
Xiubo Li July 26, 2023, 1:16 p.m. UTC | #2
On 7/26/23 18:06, Aleksandr Mikhalitsyn wrote:
> Hi Xiubo!
>
> On Tue, Jul 25, 2023 at 12:46 PM <xiubli@redhat.com> wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> The num_fwd in MClientRequestForward is int32_t, while the num_fwd
>> in ceph_mds_request_head is __u8. This is buggy when the num_fwd
>> is larger than 256 it will always be truncate to 0 again. But the
>> client couldn't recoginize this.
>>
>> This will make them to __u32 instead. Because the old cephs will
>> directly copy the raw memories when decoding the reqeust's head,
>> so we need to make sure this kclient will be compatible with old
>> cephs. For newer cephs they will decode the requests depending
>> the version, which will be much simpler and easier to extend new
>> members.
>>
>> URL: https://tracker.ceph.com/issues/62145
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/mds_client.c         | 191 ++++++++++++++++++-----------------
>>   fs/ceph/mds_client.h         |   4 +-
>>   include/linux/ceph/ceph_fs.h |  23 ++++-
>>   3 files changed, 124 insertions(+), 94 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 70987b3c198a..191bae3a4ee6 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
>>          }
>>   }
>>
>> +static struct ceph_mds_request_head_legacy *
>> +find_legacy_request_head(void *p, u64 features)
>> +{
>> +       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
>> +       struct ceph_mds_request_head_old *ohead;
>> +
>> +       if (legacy)
>> +               return (struct ceph_mds_request_head_legacy *)p;
>> +       ohead = (struct ceph_mds_request_head_old *)p;
>> +       return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
>> +}
>> +
>>   /*
>>    * called under mdsc->mutex
>>    */
>> @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>          int mds = session->s_mds;
>>          struct ceph_mds_client *mdsc = session->s_mdsc;
>>          struct ceph_msg *msg;
>> -       struct ceph_mds_request_head_old *head;
>> +       struct ceph_mds_request_head_legacy *lhead;
>>          const char *path1 = NULL;
>>          const char *path2 = NULL;
>>          u64 ino1 = 0, ino2 = 0;
>> @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>          void *p, *end;
>>          int ret;
>>          bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
>> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>>
>>          ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
>>                                req->r_parent, req->r_path1, req->r_ino1.ino,
>> @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>                  goto out_free2;
>>          }
>>
>> -       len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
>> +       /*
>> +        * For old cephs without supporting the 32bit retry/fwd feature
>> +        * it will copy the raw memories directly when decoding the
>> +        * requests. While new cephs will decode the head depending the
>> +        * version member, so we need to make sure it will be compatible
>> +        * with them both.
>> +        */
>> +       if (legacy)
>> +               len = sizeof(struct ceph_mds_request_head_legacy);
>> +       else if (old_version)
>> +               len = sizeof(struct ceph_mds_request_head_old);
>> +       else
>> +               len = sizeof(struct ceph_mds_request_head);
>>
>>          /* filepaths */
>>          len += 2 * (1 + sizeof(u32) + sizeof(u64));
>> @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>
>>          msg->hdr.tid = cpu_to_le64(req->r_tid);
>>
>> +       lhead = find_legacy_request_head(msg->front.iov_base,
>> +                                        session->s_con.peer_features);
>> +
>>          /*
>> -        * The old ceph_mds_request_head didn't contain a version field, and
>> +        * The ceph_mds_request_head_legacy didn't contain a version field, and
>>           * one was added when we moved the message version from 3->4.
>>           */
>>          if (legacy) {
>>                  msg->hdr.version = cpu_to_le16(3);
>> -               head = msg->front.iov_base;
>> -               p = msg->front.iov_base + sizeof(*head);
>> +               p = msg->front.iov_base + sizeof(*lhead);
>> +       } else if (old_version) {
>> +               struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
>> +
>> +               msg->hdr.version = cpu_to_le16(4);
>> +               ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> Don't we want to use the old mds request head version here:
> ohead->version = cpu_to_le16(1);
> ?
>
> As far as I understand the idea here is to skip new fields
> ext_num_retry/ext_num_fwd in case
> when the old_version is true. Would it be incorrect to set version to
> the latest one (CEPH_MDS_REQUEST_HEAD_VERSION)
> and at the same time skip the setting of new fields?

Hi Alex,

I noticed that, but it doesn't matter, because the old version
cephs do nothing with the version field. I am just following
the user space code.

But to make the code more readable and easier to understand I will
revise this, and I will also raise a PR to fix the user space code.

Thanks

- Xiubo


> Kind regards,
> Alex
>
>> +               p = msg->front.iov_base + sizeof(*ohead);
>>          } else {
>> -               struct ceph_mds_request_head *new_head = msg->front.iov_base;
>> +               struct ceph_mds_request_head *nhead = msg->front.iov_base;
>>
>>                  msg->hdr.version = cpu_to_le16(6);
>> -               new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
>> -               head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
>> -               p = msg->front.iov_base + sizeof(*new_head);
>> +               nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
>> +               p = msg->front.iov_base + sizeof(*nhead);
>>          }
>>
>>          end = msg->front.iov_base + msg->front.iov_len;
>>
>> -       head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
>> -       head->op = cpu_to_le32(req->r_op);
>> -       head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
>> -                                                req->r_cred->fsuid));
>> -       head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
>> -                                                req->r_cred->fsgid));
>> -       head->ino = cpu_to_le64(req->r_deleg_ino);
>> -       head->args = req->r_args;
>> +       lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
>> +       lhead->op = cpu_to_le32(req->r_op);
>> +       lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
>> +                                                 req->r_cred->fsuid));
>> +       lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
>> +                                                 req->r_cred->fsgid));
>> +       lhead->ino = cpu_to_le64(req->r_deleg_ino);
>> +       lhead->args = req->r_args;
>>
>>          ceph_encode_filepath(&p, end, ino1, path1);
>>          ceph_encode_filepath(&p, end, ino2, path2);
>> @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>>                  p = msg->front.iov_base + req->r_request_release_offset;
>>          }
>>
>> -       head->num_releases = cpu_to_le16(releases);
>> +       lhead->num_releases = cpu_to_le16(releases);
>>
>>          encode_mclientrequest_tail(&p, req);
>>
>> @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
>>          complete_all(&req->r_completion);
>>   }
>>
>> -static struct ceph_mds_request_head_old *
>> -find_old_request_head(void *p, u64 features)
>> -{
>> -       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
>> -       struct ceph_mds_request_head *new_head;
>> -
>> -       if (legacy)
>> -               return (struct ceph_mds_request_head_old *)p;
>> -       new_head = (struct ceph_mds_request_head *)p;
>> -       return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
>> -}
>> -
>>   /*
>>    * called under mdsc->mutex
>>    */
>> @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>>          int mds = session->s_mds;
>>          struct ceph_mds_client *mdsc = session->s_mdsc;
>>          struct ceph_client *cl = mdsc->fsc->client;
>> -       struct ceph_mds_request_head_old *rhead;
>> +       struct ceph_mds_request_head_legacy *lhead;
>> +       struct ceph_mds_request_head *nhead;
>>          struct ceph_msg *msg;
>> -       int flags = 0, max_retry;
>> +       int flags = 0, old_max_retry;
>> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
>>
>> -       /*
>> -        * The type of 'r_attempts' in kernel 'ceph_mds_request'
>> -        * is 'int', while in 'ceph_mds_request_head' the type of
>> -        * 'num_retry' is '__u8'. So in case the request retries
>> -        *  exceeding 256 times, the MDS will receive a incorrect
>> -        *  retry seq.
>> -        *
>> -        * In this case it's ususally a bug in MDS and continue
>> -        * retrying the request makes no sense.
>> -        *
>> -        * In future this could be fixed in ceph code, so avoid
>> -        * using the hardcode here.
>> +       /* Avoid inifinite retrying after overflow. The client will
>> +        * increase the retry count and if the MDS is old version,
>> +        * so we limit to retry at most 256 times.
>>           */
>> -       max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
>> -       max_retry = 1 << (max_retry * BITS_PER_BYTE);
>> -       if (req->r_attempts >= max_retry) {
>> -               pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
>> -                                          req->r_tid);
>> -               return -EMULTIHOP;
>> -       }
>> +       if (req->r_attempts) {
>> +               old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry);
>> +               old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
>> +               if ((old_version && req->r_attempts >= old_max_retry) ||
>> +                   ((uint32_t)req->r_attempts >= U32_MAX)) {
>> +                       pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n",
>> +                                                 __func__, req->r_tid);
>> +                       return -EMULTIHOP;
>> +               }
>> +        }
>>
>>          req->r_attempts++;
>>          if (req->r_inode) {
>> @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>>                   * d_move mangles the src name.
>>                   */
>>                  msg = req->r_request;
>> -               rhead = find_old_request_head(msg->front.iov_base,
>> -                                             session->s_con.peer_features);
>> +               lhead = find_legacy_request_head(msg->front.iov_base,
>> +                                                session->s_con.peer_features);
>>
>> -               flags = le32_to_cpu(rhead->flags);
>> +               flags = le32_to_cpu(lhead->flags);
>>                  flags |= CEPH_MDS_FLAG_REPLAY;
>> -               rhead->flags = cpu_to_le32(flags);
>> +               lhead->flags = cpu_to_le32(flags);
>>
>>                  if (req->r_target_inode)
>> -                       rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
>> +                       lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
>>
>> -               rhead->num_retry = req->r_attempts - 1;
>> +               lhead->num_retry = req->r_attempts - 1;
>> +               if (!old_version) {
>> +                       nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
>> +                       nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
>> +               }
>>
>>                  /* remove cap/dentry releases from message */
>> -               rhead->num_releases = 0;
>> +               lhead->num_releases = 0;
>>
>>                  p = msg->front.iov_base + req->r_request_release_offset;
>>                  encode_mclientrequest_tail(&p, req);
>> @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
>>          }
>>          req->r_request = msg;
>>
>> -       rhead = find_old_request_head(msg->front.iov_base,
>> -                                     session->s_con.peer_features);
>> -       rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
>> +       lhead = find_legacy_request_head(msg->front.iov_base,
>> +                                        session->s_con.peer_features);
>> +       lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
>>          if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
>>                  flags |= CEPH_MDS_FLAG_REPLAY;
>>          if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
>>                  flags |= CEPH_MDS_FLAG_ASYNC;
>>          if (req->r_parent)
>>                  flags |= CEPH_MDS_FLAG_WANT_DENTRY;
>> -       rhead->flags = cpu_to_le32(flags);
>> -       rhead->num_fwd = req->r_num_fwd;
>> -       rhead->num_retry = req->r_attempts - 1;
>> +       lhead->flags = cpu_to_le32(flags);
>> +       lhead->num_fwd = req->r_num_fwd;
>> +       lhead->num_retry = req->r_attempts - 1;
>> +       if (!old_version) {
>> +               nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
>> +               nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
>> +               nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
>> +       }
>>
>>          doutc(cl, " r_parent = %p\n", req->r_parent);
>>          return 0;
>> @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc,
>>          if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
>>                  doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
>>                  __unregister_request(mdsc, req);
>> -       } else if (fwd_seq <= req->r_num_fwd) {
>> -               /*
>> -                * The type of 'num_fwd' in ceph 'MClientRequestForward'
>> -                * is 'int32_t', while in 'ceph_mds_request_head' the
>> -                * type is '__u8'. So in case the request bounces between
>> -                * MDSes exceeding 256 times, the client will get stuck.
>> -                *
>> -                * In this case it's ususally a bug in MDS and continue
>> -                * bouncing the request makes no sense.
>> +       } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
>> +               /* Avoid infinite retrying after overflow.
>>                   *
>> -                * In future this could be fixed in ceph code, so avoid
>> -                * using the hardcode here.
>> +                * The MDS will increase the fwd count, and on the client
>> +                * side, if the num_fwd is less than the one saved in the
>> +                * request, that means the MDS is an old version and the
>> +                * 8-bit counter overflowed.
>>                   */
>> -               int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
>> -               max = 1 << (max * BITS_PER_BYTE);
>> -               if (req->r_num_fwd >= max) {
>> -                       mutex_lock(&req->r_fill_mutex);
>> -                       req->r_err = -EMULTIHOP;
>> -                       set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
>> -                       mutex_unlock(&req->r_fill_mutex);
>> -                       aborted = true;
>> -                       pr_warn_ratelimited_client(cl,
>> -                                       "forward tid %llu seq overflow\n",
>> -                                       tid);
>> -               } else {
>> -                       doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n",
>> -                             tid, next_mds, req->r_num_fwd, fwd_seq);
>> -               }
>> +               mutex_lock(&req->r_fill_mutex);
>> +               req->r_err = -EMULTIHOP;
>> +               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
>> +               mutex_unlock(&req->r_fill_mutex);
>> +               aborted = true;
>> +               pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid);
>>          } else {
>>                  /* resend. forward race not possible; mds would drop */
>>                  doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index befbd384428e..717a7399bacb 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -32,8 +32,9 @@ enum ceph_feature_type {
>>          CEPHFS_FEATURE_ALTERNATE_NAME,
>>          CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
>>          CEPHFS_FEATURE_OP_GETVXATTR,
>> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,
>>
>> -       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
>> +       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
>>   };
>>
>>   #define CEPHFS_FEATURES_CLIENT_SUPPORTED {     \
>> @@ -47,6 +48,7 @@ enum ceph_feature_type {
>>          CEPHFS_FEATURE_ALTERNATE_NAME,          \
>>          CEPHFS_FEATURE_NOTIFY_SESSION_STATE,    \
>>          CEPHFS_FEATURE_OP_GETVXATTR,            \
>> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,        \
>>   }
>>
>>   /*
>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>> index 45f8ce61e103..f3b3593254b9 100644
>> --- a/include/linux/ceph/ceph_fs.h
>> +++ b/include/linux/ceph/ceph_fs.h
>> @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext {
>>   #define CEPH_MDS_FLAG_WANT_DENTRY      2 /* want dentry in reply */
>>   #define CEPH_MDS_FLAG_ASYNC            4 /* request is asynchronous */
>>
>> -struct ceph_mds_request_head_old {
>> +struct ceph_mds_request_head_legacy {
>>          __le64 oldest_client_tid;
>>          __le32 mdsmap_epoch;           /* on client */
>>          __le32 flags;                  /* CEPH_MDS_FLAG_* */
>> @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old {
>>          union ceph_mds_request_args args;
>>   } __attribute__ ((packed));
>>
>> -#define CEPH_MDS_REQUEST_HEAD_VERSION  1
>> +#define CEPH_MDS_REQUEST_HEAD_VERSION  2
>>
>> -struct ceph_mds_request_head {
>> +struct ceph_mds_request_head_old {
>>          __le16 version;                /* struct version */
>>          __le64 oldest_client_tid;
>>          __le32 mdsmap_epoch;           /* on client */
>> @@ -513,6 +513,23 @@ struct ceph_mds_request_head {
>>          union ceph_mds_request_args_ext args;
>>   } __attribute__ ((packed));
>>
>> +struct ceph_mds_request_head {
>> +       __le16 version;                /* struct version */
>> +       __le64 oldest_client_tid;
>> +       __le32 mdsmap_epoch;           /* on client */
>> +       __le32 flags;                  /* CEPH_MDS_FLAG_* */
>> +       __u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
>> +       __le16 num_releases;           /* # include cap/lease release records */
>> +       __le32 op;                     /* mds op code */
>> +       __le32 caller_uid, caller_gid;
>> +       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
>> +                                         etc. (if replaying) */
>> +       union ceph_mds_request_args_ext args;
>> +
>> +       __le32 ext_num_retry;          /* new count retry attempts */
>> +       __le32 ext_num_fwd;            /* new count fwd attempts */
>> +} __attribute__ ((packed));
>> +
>>   /* cap/lease release record */
>>   struct ceph_mds_request_release {
>>          __le64 ino, cap_id;            /* ino and unique cap id */
>> --
>> 2.40.1
>>
Alexander Mikhalitsyn July 26, 2023, 1:19 p.m. UTC | #3
On Wed, Jul 26, 2023 at 3:16 PM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 7/26/23 18:06, Aleksandr Mikhalitsyn wrote:
> > Hi Xiubo!
> >
> > On Tue, Jul 25, 2023 at 12:46 PM <xiubli@redhat.com> wrote:
> >> From: Xiubo Li <xiubli@redhat.com>
> >>
> >> The num_fwd in MClientRequestForward is int32_t, while the num_fwd
> >> in ceph_mds_request_head is __u8. This is buggy when the num_fwd
> >> is larger than 256 it will always be truncated to 0 again. But the
> >> client couldn't recognize this.
> >>
> >> This will make them __u32 instead. Because the old cephs will
> >> directly copy the raw memories when decoding the request's head,
> >> so we need to make sure this kclient will be compatible with old
> >> cephs. For newer cephs they will decode the requests depending
> >> on the version, which will be much simpler and easier to extend
> >> with new members.
> >>
> >> URL: https://tracker.ceph.com/issues/62145
> >> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> >> ---
> >>   fs/ceph/mds_client.c         | 191 ++++++++++++++++++-----------------
> >>   fs/ceph/mds_client.h         |   4 +-
> >>   include/linux/ceph/ceph_fs.h |  23 ++++-
> >>   3 files changed, 124 insertions(+), 94 deletions(-)
> >>
> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >> index 70987b3c198a..191bae3a4ee6 100644
> >> --- a/fs/ceph/mds_client.c
> >> +++ b/fs/ceph/mds_client.c
> >> @@ -2897,6 +2897,18 @@ static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
> >>          }
> >>   }
> >>
> >> +static struct ceph_mds_request_head_legacy *
> >> +find_legacy_request_head(void *p, u64 features)
> >> +{
> >> +       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> >> +       struct ceph_mds_request_head_old *ohead;
> >> +
> >> +       if (legacy)
> >> +               return (struct ceph_mds_request_head_legacy *)p;
> >> +       ohead = (struct ceph_mds_request_head_old *)p;
> >> +       return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
> >> +}
> >> +
> >>   /*
> >>    * called under mdsc->mutex
> >>    */
> >> @@ -2907,7 +2919,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>          int mds = session->s_mds;
> >>          struct ceph_mds_client *mdsc = session->s_mdsc;
> >>          struct ceph_msg *msg;
> >> -       struct ceph_mds_request_head_old *head;
> >> +       struct ceph_mds_request_head_legacy *lhead;
> >>          const char *path1 = NULL;
> >>          const char *path2 = NULL;
> >>          u64 ino1 = 0, ino2 = 0;
> >> @@ -2919,6 +2931,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>          void *p, *end;
> >>          int ret;
> >>          bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
> >> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> >>
> >>          ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> >>                                req->r_parent, req->r_path1, req->r_ino1.ino,
> >> @@ -2950,7 +2963,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>                  goto out_free2;
> >>          }
> >>
> >> -       len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
> >> +       /*
> >> +        * Old cephs that do not support the 32bit retry/fwd feature
> >> +        * will copy the raw memory directly when decoding the
> >> +        * requests, while new cephs will decode the head depending
> >> +        * on the version member, so we need to make sure it will be
> >> +        * compatible with both.
> >> +        */
> >> +       if (legacy)
> >> +               len = sizeof(struct ceph_mds_request_head_legacy);
> >> +       else if (old_version)
> >> +               len = sizeof(struct ceph_mds_request_head_old);
> >> +       else
> >> +               len = sizeof(struct ceph_mds_request_head);
> >>
> >>          /* filepaths */
> >>          len += 2 * (1 + sizeof(u32) + sizeof(u64));
> >> @@ -2995,33 +3020,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>
> >>          msg->hdr.tid = cpu_to_le64(req->r_tid);
> >>
> >> +       lhead = find_legacy_request_head(msg->front.iov_base,
> >> +                                        session->s_con.peer_features);
> >> +
> >>          /*
> >> -        * The old ceph_mds_request_head didn't contain a version field, and
> >> +        * The ceph_mds_request_head_legacy didn't contain a version field, and
> >>           * one was added when we moved the message version from 3->4.
> >>           */
> >>          if (legacy) {
> >>                  msg->hdr.version = cpu_to_le16(3);
> >> -               head = msg->front.iov_base;
> >> -               p = msg->front.iov_base + sizeof(*head);
> >> +               p = msg->front.iov_base + sizeof(*lhead);
> >> +       } else if (old_version) {
> >> +               struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
> >> +
> >> +               msg->hdr.version = cpu_to_le16(4);
> >> +               ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> > Don't we want to use the old mds request head version here:
> > ohead->version = cpu_to_le16(1);
> > ?
> >
> > As far as I understand the idea here is to skip new fields
> > ext_num_retry/ext_num_fwd in case
> > when the old_version is true. Would it be incorrect to set version to
> > the latest one (CEPH_MDS_REQUEST_HEAD_VERSION)
> > and at the same time skip the setting of new fields?
>
> Hi Alex,
>
> I noticed that, but this doesn't matter, because for the old version
> cephs they will do nothing with the version field. I am just following
> the user space code.

Got it. Thanks!

Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>

>
> But to make the code to be more readable and easier to understand I will
> revise this. And also will raise one PR to fix the user space code.
>
> Thanks
>
> - Xiubo
>
>
> > Kind regards,
> > Alex
> >
> >> +               p = msg->front.iov_base + sizeof(*ohead);
> >>          } else {
> >> -               struct ceph_mds_request_head *new_head = msg->front.iov_base;
> >> +               struct ceph_mds_request_head *nhead = msg->front.iov_base;
> >>
> >>                  msg->hdr.version = cpu_to_le16(6);
> >> -               new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> >> -               head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> >> -               p = msg->front.iov_base + sizeof(*new_head);
> >> +               nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
> >> +               p = msg->front.iov_base + sizeof(*nhead);
> >>          }
> >>
> >>          end = msg->front.iov_base + msg->front.iov_len;
> >>
> >> -       head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> >> -       head->op = cpu_to_le32(req->r_op);
> >> -       head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> >> -                                                req->r_cred->fsuid));
> >> -       head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> >> -                                                req->r_cred->fsgid));
> >> -       head->ino = cpu_to_le64(req->r_deleg_ino);
> >> -       head->args = req->r_args;
> >> +       lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
> >> +       lhead->op = cpu_to_le32(req->r_op);
> >> +       lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
> >> +                                                 req->r_cred->fsuid));
> >> +       lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
> >> +                                                 req->r_cred->fsgid));
> >> +       lhead->ino = cpu_to_le64(req->r_deleg_ino);
> >> +       lhead->args = req->r_args;
> >>
> >>          ceph_encode_filepath(&p, end, ino1, path1);
> >>          ceph_encode_filepath(&p, end, ino2, path2);
> >> @@ -3063,7 +3095,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> >>                  p = msg->front.iov_base + req->r_request_release_offset;
> >>          }
> >>
> >> -       head->num_releases = cpu_to_le16(releases);
> >> +       lhead->num_releases = cpu_to_le16(releases);
> >>
> >>          encode_mclientrequest_tail(&p, req);
> >>
> >> @@ -3114,18 +3146,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
> >>          complete_all(&req->r_completion);
> >>   }
> >>
> >> -static struct ceph_mds_request_head_old *
> >> -find_old_request_head(void *p, u64 features)
> >> -{
> >> -       bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
> >> -       struct ceph_mds_request_head *new_head;
> >> -
> >> -       if (legacy)
> >> -               return (struct ceph_mds_request_head_old *)p;
> >> -       new_head = (struct ceph_mds_request_head *)p;
> >> -       return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
> >> -}
> >> -
> >>   /*
> >>    * called under mdsc->mutex
> >>    */
> >> @@ -3136,30 +3156,26 @@ static int __prepare_send_request(struct ceph_mds_session *session,
> >>          int mds = session->s_mds;
> >>          struct ceph_mds_client *mdsc = session->s_mdsc;
> >>          struct ceph_client *cl = mdsc->fsc->client;
> >> -       struct ceph_mds_request_head_old *rhead;
> >> +       struct ceph_mds_request_head_legacy *lhead;
> >> +       struct ceph_mds_request_head *nhead;
> >>          struct ceph_msg *msg;
> >> -       int flags = 0, max_retry;
> >> +       int flags = 0, old_max_retry;
> >> +       bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
> >>
> >> -       /*
> >> -        * The type of 'r_attempts' in kernel 'ceph_mds_request'
> >> -        * is 'int', while in 'ceph_mds_request_head' the type of
> >> -        * 'num_retry' is '__u8'. So in case the request retries
> >> -        *  exceeding 256 times, the MDS will receive a incorrect
> >> -        *  retry seq.
> >> -        *
> >> -        * In this case it's ususally a bug in MDS and continue
> >> -        * retrying the request makes no sense.
> >> -        *
> >> -        * In future this could be fixed in ceph code, so avoid
> >> -        * using the hardcode here.
> >> +       /* Avoid infinite retrying after overflow. The client will
> >> +        * increase the retry count, and if the MDS is an old version
> >> +        * we limit retrying to at most 256 times.
> >>           */
> >> -       max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
> >> -       max_retry = 1 << (max_retry * BITS_PER_BYTE);
> >> -       if (req->r_attempts >= max_retry) {
> >> -               pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
> >> -                                          req->r_tid);
> >> -               return -EMULTIHOP;
> >> -       }
> >> +       if (req->r_attempts) {
> >> +               old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry);
> >> +               old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
> >> +               if ((old_version && req->r_attempts >= old_max_retry) ||
> >> +                   ((uint32_t)req->r_attempts >= U32_MAX)) {
> >> +                       pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n",
> >> +                                                 __func__, req->r_tid);
> >> +                       return -EMULTIHOP;
> >> +               }
> >> +        }
> >>
> >>          req->r_attempts++;
> >>          if (req->r_inode) {
> >> @@ -3184,20 +3200,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
> >>                   * d_move mangles the src name.
> >>                   */
> >>                  msg = req->r_request;
> >> -               rhead = find_old_request_head(msg->front.iov_base,
> >> -                                             session->s_con.peer_features);
> >> +               lhead = find_legacy_request_head(msg->front.iov_base,
> >> +                                                session->s_con.peer_features);
> >>
> >> -               flags = le32_to_cpu(rhead->flags);
> >> +               flags = le32_to_cpu(lhead->flags);
> >>                  flags |= CEPH_MDS_FLAG_REPLAY;
> >> -               rhead->flags = cpu_to_le32(flags);
> >> +               lhead->flags = cpu_to_le32(flags);
> >>
> >>                  if (req->r_target_inode)
> >> -                       rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
> >> +                       lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
> >>
> >> -               rhead->num_retry = req->r_attempts - 1;
> >> +               lhead->num_retry = req->r_attempts - 1;
> >> +               if (!old_version) {
> >> +                       nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> >> +                       nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> >> +               }
> >>
> >>                  /* remove cap/dentry releases from message */
> >> -               rhead->num_releases = 0;
> >> +               lhead->num_releases = 0;
> >>
> >>                  p = msg->front.iov_base + req->r_request_release_offset;
> >>                  encode_mclientrequest_tail(&p, req);
> >> @@ -3218,18 +3238,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
> >>          }
> >>          req->r_request = msg;
> >>
> >> -       rhead = find_old_request_head(msg->front.iov_base,
> >> -                                     session->s_con.peer_features);
> >> -       rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
> >> +       lhead = find_legacy_request_head(msg->front.iov_base,
> >> +                                        session->s_con.peer_features);
> >> +       lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
> >>          if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
> >>                  flags |= CEPH_MDS_FLAG_REPLAY;
> >>          if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
> >>                  flags |= CEPH_MDS_FLAG_ASYNC;
> >>          if (req->r_parent)
> >>                  flags |= CEPH_MDS_FLAG_WANT_DENTRY;
> >> -       rhead->flags = cpu_to_le32(flags);
> >> -       rhead->num_fwd = req->r_num_fwd;
> >> -       rhead->num_retry = req->r_attempts - 1;
> >> +       lhead->flags = cpu_to_le32(flags);
> >> +       lhead->num_fwd = req->r_num_fwd;
> >> +       lhead->num_retry = req->r_attempts - 1;
> >> +       if (!old_version) {
> >> +               nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
> >> +               nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
> >> +               nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
> >> +       }
> >>
> >>          doutc(cl, " r_parent = %p\n", req->r_parent);
> >>          return 0;
> >> @@ -3893,34 +3918,20 @@ static void handle_forward(struct ceph_mds_client *mdsc,
> >>          if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
> >>                  doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
> >>                  __unregister_request(mdsc, req);
> >> -       } else if (fwd_seq <= req->r_num_fwd) {
> >> -               /*
> >> -                * The type of 'num_fwd' in ceph 'MClientRequestForward'
> >> -                * is 'int32_t', while in 'ceph_mds_request_head' the
> >> -                * type is '__u8'. So in case the request bounces between
> >> -                * MDSes exceeding 256 times, the client will get stuck.
> >> -                *
> >> -                * In this case it's ususally a bug in MDS and continue
> >> -                * bouncing the request makes no sense.
> >> +       } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
> >> +               /* Avoid infinite retrying after overflow.
> >>                   *
> >> -                * In future this could be fixed in ceph code, so avoid
> >> -                * using the hardcode here.
> >> +                * The MDS will increase the fwd count, and on the client
> >> +                * side, if the num_fwd is less than the one saved in the
> >> +                * request, that means the MDS is an old version and the
> >> +                * 8-bit counter overflowed.
> >>                   */
> >> -               int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
> >> -               max = 1 << (max * BITS_PER_BYTE);
> >> -               if (req->r_num_fwd >= max) {
> >> -                       mutex_lock(&req->r_fill_mutex);
> >> -                       req->r_err = -EMULTIHOP;
> >> -                       set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> >> -                       mutex_unlock(&req->r_fill_mutex);
> >> -                       aborted = true;
> >> -                       pr_warn_ratelimited_client(cl,
> >> -                                       "forward tid %llu seq overflow\n",
> >> -                                       tid);
> >> -               } else {
> >> -                       doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n",
> >> -                             tid, next_mds, req->r_num_fwd, fwd_seq);
> >> -               }
> >> +               mutex_lock(&req->r_fill_mutex);
> >> +               req->r_err = -EMULTIHOP;
> >> +               set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
> >> +               mutex_unlock(&req->r_fill_mutex);
> >> +               aborted = true;
> >> +               pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid);
> >>          } else {
> >>                  /* resend. forward race not possible; mds would drop */
> >>                  doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
> >> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >> index befbd384428e..717a7399bacb 100644
> >> --- a/fs/ceph/mds_client.h
> >> +++ b/fs/ceph/mds_client.h
> >> @@ -32,8 +32,9 @@ enum ceph_feature_type {
> >>          CEPHFS_FEATURE_ALTERNATE_NAME,
> >>          CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
> >>          CEPHFS_FEATURE_OP_GETVXATTR,
> >> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>
> >> -       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
> >> +       CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
> >>   };
> >>
> >>   #define CEPHFS_FEATURES_CLIENT_SUPPORTED {     \
> >> @@ -47,6 +48,7 @@ enum ceph_feature_type {
> >>          CEPHFS_FEATURE_ALTERNATE_NAME,          \
> >>          CEPHFS_FEATURE_NOTIFY_SESSION_STATE,    \
> >>          CEPHFS_FEATURE_OP_GETVXATTR,            \
> >> +       CEPHFS_FEATURE_32BITS_RETRY_FWD,        \
> >>   }
> >>
> >>   /*
> >> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> >> index 45f8ce61e103..f3b3593254b9 100644
> >> --- a/include/linux/ceph/ceph_fs.h
> >> +++ b/include/linux/ceph/ceph_fs.h
> >> @@ -484,7 +484,7 @@ union ceph_mds_request_args_ext {
> >>   #define CEPH_MDS_FLAG_WANT_DENTRY      2 /* want dentry in reply */
> >>   #define CEPH_MDS_FLAG_ASYNC            4 /* request is asynchronous */
> >>
> >> -struct ceph_mds_request_head_old {
> >> +struct ceph_mds_request_head_legacy {
> >>          __le64 oldest_client_tid;
> >>          __le32 mdsmap_epoch;           /* on client */
> >>          __le32 flags;                  /* CEPH_MDS_FLAG_* */
> >> @@ -497,9 +497,9 @@ struct ceph_mds_request_head_old {
> >>          union ceph_mds_request_args args;
> >>   } __attribute__ ((packed));
> >>
> >> -#define CEPH_MDS_REQUEST_HEAD_VERSION  1
> >> +#define CEPH_MDS_REQUEST_HEAD_VERSION  2
> >>
> >> -struct ceph_mds_request_head {
> >> +struct ceph_mds_request_head_old {
> >>          __le16 version;                /* struct version */
> >>          __le64 oldest_client_tid;
> >>          __le32 mdsmap_epoch;           /* on client */
> >> @@ -513,6 +513,23 @@ struct ceph_mds_request_head {
> >>          union ceph_mds_request_args_ext args;
> >>   } __attribute__ ((packed));
> >>
> >> +struct ceph_mds_request_head {
> >> +       __le16 version;                /* struct version */
> >> +       __le64 oldest_client_tid;
> >> +       __le32 mdsmap_epoch;           /* on client */
> >> +       __le32 flags;                  /* CEPH_MDS_FLAG_* */
> >> +       __u8 num_retry, num_fwd;       /* legacy count retry and fwd attempts */
> >> +       __le16 num_releases;           /* # include cap/lease release records */
> >> +       __le32 op;                     /* mds op code */
> >> +       __le32 caller_uid, caller_gid;
> >> +       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
> >> +                                         etc. (if replaying) */
> >> +       union ceph_mds_request_args_ext args;
> >> +
> >> +       __le32 ext_num_retry;          /* new count retry attempts */
> >> +       __le32 ext_num_fwd;            /* new count fwd attempts */
> >> +} __attribute__ ((packed));
> >> +
> >>   /* cap/lease release record */
> >>   struct ceph_mds_request_release {
> >>          __le64 ino, cap_id;            /* ino and unique cap id */
> >> --
> >> 2.40.1
> >>
>
diff mbox series

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 70987b3c198a..191bae3a4ee6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2897,6 +2897,18 @@  static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *
 	}
 }
 
+static struct ceph_mds_request_head_legacy *
+find_legacy_request_head(void *p, u64 features)
+{
+	bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
+	struct ceph_mds_request_head_old *ohead;
+
+	if (legacy)
+		return (struct ceph_mds_request_head_legacy *)p;
+	ohead = (struct ceph_mds_request_head_old *)p;
+	return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
+}
+
 /*
  * called under mdsc->mutex
  */
@@ -2907,7 +2919,7 @@  static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	int mds = session->s_mds;
 	struct ceph_mds_client *mdsc = session->s_mdsc;
 	struct ceph_msg *msg;
-	struct ceph_mds_request_head_old *head;
+	struct ceph_mds_request_head_legacy *lhead;
 	const char *path1 = NULL;
 	const char *path2 = NULL;
 	u64 ino1 = 0, ino2 = 0;
@@ -2919,6 +2931,7 @@  static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 	void *p, *end;
 	int ret;
 	bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
+	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
 
 	ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
 			      req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2950,7 +2963,19 @@  static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 		goto out_free2;
 	}
 
-	len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
+	/*
+	 * Old cephs that do not support the 32bit retry/fwd feature
+	 * will copy the raw memory directly when decoding the
+	 * requests, while new cephs will decode the head depending
+	 * on the version member, so we need to make sure it will be
+	 * compatible with both.
+	 */
+	if (legacy)
+		len = sizeof(struct ceph_mds_request_head_legacy);
+	else if (old_version)
+		len = sizeof(struct ceph_mds_request_head_old);
+	else
+		len = sizeof(struct ceph_mds_request_head);
 
 	/* filepaths */
 	len += 2 * (1 + sizeof(u32) + sizeof(u64));
@@ -2995,33 +3020,40 @@  static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 
 	msg->hdr.tid = cpu_to_le64(req->r_tid);
 
+	lhead = find_legacy_request_head(msg->front.iov_base,
+					 session->s_con.peer_features);
+
 	/*
-	 * The old ceph_mds_request_head didn't contain a version field, and
+	 * The ceph_mds_request_head_legacy didn't contain a version field, and
 	 * one was added when we moved the message version from 3->4.
 	 */
 	if (legacy) {
 		msg->hdr.version = cpu_to_le16(3);
-		head = msg->front.iov_base;
-		p = msg->front.iov_base + sizeof(*head);
+		p = msg->front.iov_base + sizeof(*lhead);
+	} else if (old_version) {
+		struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
+
+		msg->hdr.version = cpu_to_le16(4);
+		ohead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+		p = msg->front.iov_base + sizeof(*ohead);
 	} else {
-		struct ceph_mds_request_head *new_head = msg->front.iov_base;
+		struct ceph_mds_request_head *nhead = msg->front.iov_base;
 
 		msg->hdr.version = cpu_to_le16(6);
-		new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
-		head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-		p = msg->front.iov_base + sizeof(*new_head);
+		nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+		p = msg->front.iov_base + sizeof(*nhead);
 	}
 
 	end = msg->front.iov_base + msg->front.iov_len;
 
-	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
-	head->op = cpu_to_le32(req->r_op);
-	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
-						 req->r_cred->fsuid));
-	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
-						 req->r_cred->fsgid));
-	head->ino = cpu_to_le64(req->r_deleg_ino);
-	head->args = req->r_args;
+	lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+	lhead->op = cpu_to_le32(req->r_op);
+	lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+						  req->r_cred->fsuid));
+	lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+						  req->r_cred->fsgid));
+	lhead->ino = cpu_to_le64(req->r_deleg_ino);
+	lhead->args = req->r_args;
 
 	ceph_encode_filepath(&p, end, ino1, path1);
 	ceph_encode_filepath(&p, end, ino2, path2);
@@ -3063,7 +3095,7 @@  static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 		p = msg->front.iov_base + req->r_request_release_offset;
 	}
 
-	head->num_releases = cpu_to_le16(releases);
+	lhead->num_releases = cpu_to_le16(releases);
 
 	encode_mclientrequest_tail(&p, req);
 
@@ -3114,18 +3146,6 @@  static void complete_request(struct ceph_mds_client *mdsc,
 	complete_all(&req->r_completion);
 }
 
-static struct ceph_mds_request_head_old *
-find_old_request_head(void *p, u64 features)
-{
-	bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
-	struct ceph_mds_request_head *new_head;
-
-	if (legacy)
-		return (struct ceph_mds_request_head_old *)p;
-	new_head = (struct ceph_mds_request_head *)p;
-	return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-}
-
 /*
  * called under mdsc->mutex
  */
@@ -3136,30 +3156,26 @@  static int __prepare_send_request(struct ceph_mds_session *session,
 	int mds = session->s_mds;
 	struct ceph_mds_client *mdsc = session->s_mdsc;
 	struct ceph_client *cl = mdsc->fsc->client;
-	struct ceph_mds_request_head_old *rhead;
+	struct ceph_mds_request_head_legacy *lhead;
+	struct ceph_mds_request_head *nhead;
 	struct ceph_msg *msg;
-	int flags = 0, max_retry;
+	int flags = 0, old_max_retry;
+	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features);
 
-	/*
-	 * The type of 'r_attempts' in kernel 'ceph_mds_request'
-	 * is 'int', while in 'ceph_mds_request_head' the type of
-	 * 'num_retry' is '__u8'. So in case the request retries
-	 *  exceeding 256 times, the MDS will receive a incorrect
-	 *  retry seq.
-	 *
-	 * In this case it's ususally a bug in MDS and continue
-	 * retrying the request makes no sense.
-	 *
-	 * In future this could be fixed in ceph code, so avoid
-	 * using the hardcode here.
+	/* Avoid infinite retrying after the retry counter overflows.
+	 * The client keeps increasing the retry count, and if the MDS
+	 * is an old version, we limit it to at most 256 retries.
 	 */
-	max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
-	max_retry = 1 << (max_retry * BITS_PER_BYTE);
-	if (req->r_attempts >= max_retry) {
-		pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n",
-					   req->r_tid);
-		return -EMULTIHOP;
-	}
+	if (req->r_attempts) {
+		old_max_retry = sizeof_field(struct ceph_mds_request_head_old, num_retry);
+		old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
+		if ((old_version && req->r_attempts >= old_max_retry) ||
+		    ((uint32_t)req->r_attempts >= U32_MAX)) {
+			pr_warn_ratelimited_client(cl, "%s request tid %llu seq overflow\n",
+						   __func__, req->r_tid);
+			return -EMULTIHOP;
+		}
+	}
 
 	req->r_attempts++;
 	if (req->r_inode) {
@@ -3184,20 +3200,24 @@  static int __prepare_send_request(struct ceph_mds_session *session,
 		 * d_move mangles the src name.
 		 */
 		msg = req->r_request;
-		rhead = find_old_request_head(msg->front.iov_base,
-					      session->s_con.peer_features);
+		lhead = find_legacy_request_head(msg->front.iov_base,
+						 session->s_con.peer_features);
 
-		flags = le32_to_cpu(rhead->flags);
+		flags = le32_to_cpu(lhead->flags);
 		flags |= CEPH_MDS_FLAG_REPLAY;
-		rhead->flags = cpu_to_le32(flags);
+		lhead->flags = cpu_to_le32(flags);
 
 		if (req->r_target_inode)
-			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+			lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
 
-		rhead->num_retry = req->r_attempts - 1;
+		lhead->num_retry = req->r_attempts - 1;
+		if (!old_version) {
+			nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+			nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+		}
 
 		/* remove cap/dentry releases from message */
-		rhead->num_releases = 0;
+		lhead->num_releases = 0;
 
 		p = msg->front.iov_base + req->r_request_release_offset;
 		encode_mclientrequest_tail(&p, req);
@@ -3218,18 +3238,23 @@  static int __prepare_send_request(struct ceph_mds_session *session,
 	}
 	req->r_request = msg;
 
-	rhead = find_old_request_head(msg->front.iov_base,
-				      session->s_con.peer_features);
-	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+	lhead = find_legacy_request_head(msg->front.iov_base,
+					 session->s_con.peer_features);
+	lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
 	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
 		flags |= CEPH_MDS_FLAG_REPLAY;
 	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
 		flags |= CEPH_MDS_FLAG_ASYNC;
 	if (req->r_parent)
 		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
-	rhead->flags = cpu_to_le32(flags);
-	rhead->num_fwd = req->r_num_fwd;
-	rhead->num_retry = req->r_attempts - 1;
+	lhead->flags = cpu_to_le32(flags);
+	lhead->num_fwd = req->r_num_fwd;
+	lhead->num_retry = req->r_attempts - 1;
+	if (!old_version) {
+		nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+		nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
+		nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+	}
 
 	doutc(cl, " r_parent = %p\n", req->r_parent);
 	return 0;
@@ -3893,34 +3918,20 @@  static void handle_forward(struct ceph_mds_client *mdsc,
 	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
 		doutc(cl, "forward tid %llu aborted, unregistering\n", tid);
 		__unregister_request(mdsc, req);
-	} else if (fwd_seq <= req->r_num_fwd) {
-		/*
-		 * The type of 'num_fwd' in ceph 'MClientRequestForward'
-		 * is 'int32_t', while in 'ceph_mds_request_head' the
-		 * type is '__u8'. So in case the request bounces between
-		 * MDSes exceeding 256 times, the client will get stuck.
-		 *
-		 * In this case it's ususally a bug in MDS and continue
-		 * bouncing the request makes no sense.
+	} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
+		/* Avoid infinite retrying after the fwd counter overflows.
 		 *
-		 * In future this could be fixed in ceph code, so avoid
-		 * using the hardcode here.
+		 * The MDS increases the fwd count, so on the client side,
+		 * if the received num_fwd is less than the one saved in
+		 * the request, it means the MDS is an old version and the
+		 * 8-bit counter has overflowed.
 		 */
-		int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
-		max = 1 << (max * BITS_PER_BYTE);
-		if (req->r_num_fwd >= max) {
-			mutex_lock(&req->r_fill_mutex);
-			req->r_err = -EMULTIHOP;
-			set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
-			mutex_unlock(&req->r_fill_mutex);
-			aborted = true;
-			pr_warn_ratelimited_client(cl,
-					"forward tid %llu seq overflow\n",
-					tid);
-		} else {
-			doutc(cl, "forward tid %llu to mds%d - old seq %d <= %d\n",
-			      tid, next_mds, req->r_num_fwd, fwd_seq);
-		}
+		mutex_lock(&req->r_fill_mutex);
+		req->r_err = -EMULTIHOP;
+		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+		mutex_unlock(&req->r_fill_mutex);
+		aborted = true;
+		pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid);
 	} else {
 		/* resend. forward race not possible; mds would drop */
 		doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index befbd384428e..717a7399bacb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -32,8 +32,9 @@  enum ceph_feature_type {
 	CEPHFS_FEATURE_ALTERNATE_NAME,
 	CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
 	CEPHFS_FEATURE_OP_GETVXATTR,
+	CEPHFS_FEATURE_32BITS_RETRY_FWD,
 
-	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
+	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
 };
 
 #define CEPHFS_FEATURES_CLIENT_SUPPORTED {	\
@@ -47,6 +48,7 @@  enum ceph_feature_type {
 	CEPHFS_FEATURE_ALTERNATE_NAME,		\
 	CEPHFS_FEATURE_NOTIFY_SESSION_STATE,	\
 	CEPHFS_FEATURE_OP_GETVXATTR,		\
+	CEPHFS_FEATURE_32BITS_RETRY_FWD,	\
 }
 
 /*
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 45f8ce61e103..f3b3593254b9 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -484,7 +484,7 @@  union ceph_mds_request_args_ext {
 #define CEPH_MDS_FLAG_WANT_DENTRY	2 /* want dentry in reply */
 #define CEPH_MDS_FLAG_ASYNC		4 /* request is asynchronous */
 
-struct ceph_mds_request_head_old {
+struct ceph_mds_request_head_legacy {
 	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
 	__le32 flags;                  /* CEPH_MDS_FLAG_* */
@@ -497,9 +497,9 @@  struct ceph_mds_request_head_old {
 	union ceph_mds_request_args args;
 } __attribute__ ((packed));
 
-#define CEPH_MDS_REQUEST_HEAD_VERSION  1
+#define CEPH_MDS_REQUEST_HEAD_VERSION  2
 
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_old {
 	__le16 version;                /* struct version */
 	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
@@ -513,6 +513,23 @@  struct ceph_mds_request_head {
 	union ceph_mds_request_args_ext args;
 } __attribute__ ((packed));
 
+struct ceph_mds_request_head {
+	__le16 version;                /* struct version */
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* legacy retry and fwd attempt counters */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args_ext args;
+
+	__le32 ext_num_retry;          /* new count retry attempts */
+	__le32 ext_num_fwd;            /* new count fwd attempts */
+} __attribute__ ((packed));
+
 /* cap/lease release record */
 struct ceph_mds_request_release {
 	__le64 ino, cap_id;            /* ino and unique cap id */