Message ID | 20191209124715.2255-1-xiubli@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | ceph: retry the same mds later after the new session is opened | expand |
This can be seen almost every time when mounting: 92959 <7>[13220.815822] ceph: mount start 00000000e3b1070a 92960 <6>[13220.817349] libceph: mon2 (1)192.168.195.165:40566 session established 92961 <7>[13220.818639] ceph: handle_map epoch 58 len 889 92962 <7>[13220.818644] ceph: mdsmap_decode 1/3 4216 mds0.5 (1)192.168.195.165:6813 up:active ==> max_mds == 3 92963 <7>[13220.818646] ceph: mdsmap_decode 2/3 4339 mds1.55 (1)192.168.195.165:6815 up:active 92964 <7>[13220.818646] ceph: mdsmap_decode 3/3 4340 mds2.57 (1)192.168.195.165:6814 up:active 92965 <7>[13220.818648] ceph: mdsmap_decode m_enabled: 1, m_damaged: 0, m_num_laggy: 0 92966 <7>[13220.818648] ceph: mdsmap_decode success epoch 58 92967 <7>[13220.818649] ceph: check_new_map new 58 old 0 92968 <6>[13220.820577] libceph: client5819 fsid 6c39c42b-5888-4c6a-ba32-89acfa5a2353 92969 <7>[13220.821827] ceph: mount opening path \t 92970 <7>[13220.821887] ceph: open_root_inode opening '' 92971 <7>[13220.821892] ceph: do_request on 000000005666e7bf 92972 <7>[13220.821893] ceph: submit_request on 000000005666e7bf for inode 0000000057da6992 92973 <7>[13220.821896] ceph: __register_request 000000005666e7bf tid 1 92974 <7>[13220.821898] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0 92975 <7>[13220.821899] ceph: choose_mds chose random mds0 ====================> random mds0 92976 <7>[13220.821901] ceph: register_session: realloc to 1 92977 <7>[13220.821902] ceph: register_session: mds0 92978 <7>[13220.821905] ceph: mdsc get_session 000000001649fdbd 2 -> 3 92979 <7>[13220.821905] ceph: mdsc con_get 000000001649fdbd ok (3) 92980 <7>[13220.821910] ceph: mdsc get_session 000000001649fdbd 3 -> 4 92981 <7>[13220.821912] ceph: do_request mds0 session 000000001649fdbd state new 92982 <7>[13220.821913] ceph: open_session to mds0 (up:active) 92983 [...] 92984 <7>[13220.822167] ceph: do_request waiting 92985 [...] 
92986 <7>[13220.833112] ceph: handle_session mds0 open 000000001649fdbd state opening seq 0 92987 <7>[13220.833113] ceph: renewed_caps mds0 ttl now 4307945924, was fresh, now stale 92988 <7>[13220.833114] ceph: wake request 000000005666e7bf tid 1 92989 <7>[13220.833116] ceph: mdsc put_session 000000001649fdbd 4 -> 3 92990 <7>[13220.833117] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0 92991 <7>[13220.833118] ceph: choose_mds chose random mds1 ========================> random mds1 92992 <7>[13220.833119] ceph: register_session: realloc to 2 92993 <7>[13220.833121] ceph: register_session: mds1 92994 <7>[13220.833122] ceph: mdsc get_session 00000000534bd3ef 2 -> 3 92995 <7>[13220.833123] ceph: mdsc con_get 00000000534bd3ef ok (3) 92996 <7>[13220.833124] ceph: mdsc get_session 00000000534bd3ef 3 -> 4 92997 <7>[13220.833124] ceph: do_request mds1 session 00000000534bd3ef state new 92998 <7>[13220.833126] ceph: open_session to mds1 (up:active) 92999 [...] 93000 <7>[13220.845883] ceph: handle_session mds1 open 00000000534bd3ef state opening seq 0 93001 <7>[13220.845884] ceph: renewed_caps mds1 ttl now 4307945935, was fresh, now stale 93002 <7>[13220.845885] ceph: wake request 000000005666e7bf tid 1 93003 <7>[13220.845887] ceph: mdsc put_session 00000000534bd3ef 4 -> 3 93004 <7>[13220.845888] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0 93005 <7>[13220.845889] ceph: choose_mds chose random mds2 ================> random mds2 93006 <7>[13220.845890] ceph: register_session: realloc to 4 93007 <7>[13220.845891] ceph: register_session: mds2 93008 <7>[13220.845892] ceph: mdsc get_session 0000000076259af8 2 -> 3 93009 <7>[13220.845892] ceph: mdsc con_get 0000000076259af8 ok (3) 93010 <7>[13220.845893] ceph: mdsc get_session 0000000076259af8 3 -> 4 93011 <7>[13220.845893] ceph: do_request mds2 session 0000000076259af8 state new 93012 <7>[13220.845894] ceph: open_session to mds2 (up:active) [...] 
93014 <7>[13220.852986] ceph: handle_session mds2 open 0000000076259af8 state opening seq 0 93015 <7>[13220.852987] ceph: renewed_caps mds2 ttl now 4307945948, was fresh, now stale 93016 <7>[13220.852988] ceph: wake request 000000005666e7bf tid 1 93017 <7>[13220.852990] ceph: mdsc put_session 0000000076259af8 4 -> 3 93018 <7>[13220.852991] ceph: __choose_mds 0000000057da6992 is_hash=0 (0) mode 0 93019 <7>[13220.852992] ceph: choose_mds chose random mds0 ===============> choose random mds0 again 93020 <7>[13220.852993] ceph: mdsc get_session 0000000076259af8 3 -> 4 93021 <7>[13220.852994] ceph: mdsc get_session 0000000076259af8 4 -> 5 93022 <7>[13220.852994] ceph: do_request mds0 session 0000000076259af8 state open 93023 <7>[13220.852996] ceph: prepare_send_request 000000005666e7bf tid 1 getattr (attempt 1) 93024 <7>[13220.852997] ceph: path 93025 <7>[13220.852998] ceph: r_parent = 0000000057da6992 93026 [...] 93027 <7>[13220.853414] ceph: handle_reply 000000005666e7bf 93028 <7>[13220.853415] ceph: __unregister_request 000000005666e7bf tid 1 93029 <7>[13220.853416] ceph: got safe reply 1, mds0, last request: Yes 93030 <7>[13220.853417] ceph: handle_reply tid 1 result 0 93031 <7>[13220.853421] ceph: fill_trace 000000005666e7bf is_dentry 0 is_target 1 93032 <7>[13220.853424] ceph: alloc_inode 00000000a823bfed 93033 <7>[13220.853427] ceph: get_inode created new inode 00000000a823bfed 1.fffffffffffffffe ino 1 93034 <7>[13220.853428] ceph: get_inode on 1=1.fffffffffffffffe got 00000000a823bfed 93035 <7>[13220.853429] ceph: fill_inode 00000000a823bfed ino 1.fffffffffffffffe v 81286 had 0 93036 <7>[13220.853431] ceph: 00000000a823bfed mode 040755 uid.gid 0.0 93037 <7>[13220.853433] ceph: truncate_size 0 -> 18446744073709551615 93038 <7>[13220.853435] ceph: fill_trace done err=0 93039 <7>[13220.853441] ceph: mdsc con_put 0000000076259af8 (4) 93040 <7>[13220.853442] ceph: mdsc put_session 0000000076259af8 5 -> 4 93041 <7>[13220.853446] ceph: do_request waited, got 0 93042 
<7>[13220.853447] ceph: do_request 000000005666e7bf done, result 0 93043 <7>[13220.853447] ceph: open_root_inode success 93044 <7>[13220.853452] ceph: open_root_inode success, root dentry is 0000000037517481 [...] 93050 <7>[13220.853457] ceph: mount success On 2019/12/9 20:47, xiubli@redhat.com wrote: > From: Xiubo Li<xiubli@redhat.com> > > With max_mds > 1 and for a request which are choosing a random > mds rank and if the relating session is not opened yet, the request > will wait the session been opened and resend again. While every > time the request is beening __do_request, it will release the > req->session first and choose a random one again, so this time it > may choose another random mds rank. The worst case is that it will > open all the mds sessions one by one just before it can be > successfully sent out out. > > Signed-off-by: Xiubo Li<xiubli@redhat.com> > --- > fs/ceph/mds_client.c | 20 ++++++++++++++++---- > 1 file changed, 16 insertions(+), 4 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 68f3b5ed6ac8..d747e9baf9c9 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) > * Called under mdsc->mutex. > */ > static int __choose_mds(struct ceph_mds_client *mdsc, > - struct ceph_mds_request *req) > + struct ceph_mds_request *req, > + bool *random) > { > struct inode *inode; > struct ceph_inode_info *ci; > @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > u32 hash = req->r_direct_hash; > bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); > > + if (random) > + *random = false; > + > /* > * is there a specific mds we should try? ignore hint if we have > * no session and the mds is not up (active or recovering). 
> @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > return mds; > > random: > + if (random) > + *random = true; > + > mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); > dout("choose_mds chose random mds%d\n", mds); > return mds; > @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc, > struct ceph_mds_session *session = NULL; > int mds = -1; > int err = 0; > + bool random; > > if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { > if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) > @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc, > > put_request_session(req); > > - mds = __choose_mds(mdsc, req); > + mds = __choose_mds(mdsc, req, &random); > if (mds < 0 || > ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { > dout("do_request no mds or not active, waiting for map\n"); > @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc, > goto out_session; > } > if (session->s_state == CEPH_MDS_SESSION_NEW || > - session->s_state == CEPH_MDS_SESSION_CLOSING) > + session->s_state == CEPH_MDS_SESSION_CLOSING) { > __open_session(mdsc, session); > + /* retry the same mds later */ > + if (random) > + req->r_resend_mds = mds; > + } > list_add(&req->r_wait, &session->s_waiting); > goto out_session; > } > @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) > mutex_unlock(&mdsc->mutex); > goto out; > } else { > - int mds = __choose_mds(mdsc, req); > + int mds = __choose_mds(mdsc, req, NULL); > if (mds >= 0 && mds != req->r_session->s_mds) { > dout("but auth changed, so resending\n"); > __do_request(mdsc, req);
On Mon, 2019-12-09 at 07:47 -0500, xiubli@redhat.com wrote: > From: Xiubo Li <xiubli@redhat.com> > > With max_mds > 1 and for a request which are choosing a random > mds rank and if the relating session is not opened yet, the request > will wait the session been opened and resend again. While every > time the request is beening __do_request, it will release the > req->session first and choose a random one again, so this time it > may choose another random mds rank. The worst case is that it will > open all the mds sessions one by one just before it can be > successfully sent out out. > > Signed-off-by: Xiubo Li <xiubli@redhat.com> > --- > fs/ceph/mds_client.c | 20 ++++++++++++++++---- > 1 file changed, 16 insertions(+), 4 deletions(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 68f3b5ed6ac8..d747e9baf9c9 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) > * Called under mdsc->mutex. > */ > static int __choose_mds(struct ceph_mds_client *mdsc, > - struct ceph_mds_request *req) > + struct ceph_mds_request *req, > + bool *random) > { > struct inode *inode; > struct ceph_inode_info *ci; > @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > u32 hash = req->r_direct_hash; > bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); > > + if (random) > + *random = false; > + > /* > * is there a specific mds we should try? ignore hint if we have > * no session and the mds is not up (active or recovering). 
> @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, > return mds; > > random: > + if (random) > + *random = true; > + > mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); > dout("choose_mds chose random mds%d\n", mds); > return mds; > @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc, > struct ceph_mds_session *session = NULL; > int mds = -1; > int err = 0; > + bool random; > > if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { > if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) > @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc, > > put_request_session(req); > > - mds = __choose_mds(mdsc, req); > + mds = __choose_mds(mdsc, req, &random); > if (mds < 0 || > ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { > dout("do_request no mds or not active, waiting for map\n"); > @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc, > goto out_session; > } > if (session->s_state == CEPH_MDS_SESSION_NEW || > - session->s_state == CEPH_MDS_SESSION_CLOSING) > + session->s_state == CEPH_MDS_SESSION_CLOSING) { > __open_session(mdsc, session); > + /* retry the same mds later */ > + if (random) > + req->r_resend_mds = mds; > + } > list_add(&req->r_wait, &session->s_waiting); > goto out_session; > } > @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) > mutex_unlock(&mdsc->mutex); > goto out; > } else { > - int mds = __choose_mds(mdsc, req); > + int mds = __choose_mds(mdsc, req, NULL); > if (mds >= 0 && mds != req->r_session->s_mds) { > dout("but auth changed, so resending\n"); > __do_request(mdsc, req); Is there a tracker bug for this? This does seem like the behavior we'd want. I wish it were a little less Rube Goldberg to do this, but this code is already pretty messy so this doesn't really make it too much worse. In any case, merged with a reworded changelog.
On 2019/12/11 21:41, Jeff Layton wrote: > On Mon, 2019-12-09 at 07:47 -0500, xiubli@redhat.com wrote: >> From: Xiubo Li <xiubli@redhat.com> >> >> With max_mds > 1 and for a request which are choosing a random >> mds rank and if the relating session is not opened yet, the request >> will wait the session been opened and resend again. While every >> time the request is beening __do_request, it will release the >> req->session first and choose a random one again, so this time it >> may choose another random mds rank. The worst case is that it will >> open all the mds sessions one by one just before it can be >> successfully sent out out. >> >> Signed-off-by: Xiubo Li <xiubli@redhat.com> >> --- >> fs/ceph/mds_client.c | 20 ++++++++++++++++---- >> 1 file changed, 16 insertions(+), 4 deletions(-) >> >> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c >> index 68f3b5ed6ac8..d747e9baf9c9 100644 >> --- a/fs/ceph/mds_client.c >> +++ b/fs/ceph/mds_client.c >> @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) >> * Called under mdsc->mutex. >> */ >> static int __choose_mds(struct ceph_mds_client *mdsc, >> - struct ceph_mds_request *req) >> + struct ceph_mds_request *req, >> + bool *random) >> { >> struct inode *inode; >> struct ceph_inode_info *ci; >> @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, >> u32 hash = req->r_direct_hash; >> bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); >> >> + if (random) >> + *random = false; >> + >> /* >> * is there a specific mds we should try? ignore hint if we have >> * no session and the mds is not up (active or recovering). 
>> @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, >> return mds; >> >> random: >> + if (random) >> + *random = true; >> + >> mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); >> dout("choose_mds chose random mds%d\n", mds); >> return mds; >> @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc, >> struct ceph_mds_session *session = NULL; >> int mds = -1; >> int err = 0; >> + bool random; >> >> if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { >> if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) >> @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc, >> >> put_request_session(req); >> >> - mds = __choose_mds(mdsc, req); >> + mds = __choose_mds(mdsc, req, &random); >> if (mds < 0 || >> ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { >> dout("do_request no mds or not active, waiting for map\n"); >> @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc, >> goto out_session; >> } >> if (session->s_state == CEPH_MDS_SESSION_NEW || >> - session->s_state == CEPH_MDS_SESSION_CLOSING) >> + session->s_state == CEPH_MDS_SESSION_CLOSING) { >> __open_session(mdsc, session); >> + /* retry the same mds later */ >> + if (random) >> + req->r_resend_mds = mds; >> + } >> list_add(&req->r_wait, &session->s_waiting); >> goto out_session; >> } >> @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) >> mutex_unlock(&mdsc->mutex); >> goto out; >> } else { >> - int mds = __choose_mds(mdsc, req); >> + int mds = __choose_mds(mdsc, req, NULL); >> if (mds >= 0 && mds != req->r_session->s_mds) { >> dout("but auth changed, so resending\n"); >> __do_request(mdsc, req); > Is there a tracker bug for this? I created one just now: https://tracker.ceph.com/issues/43270 > > This does seem like the behavior we'd want. 
I wish it were a little less > Rube Goldberg to do this, but this code is already pretty messy so this > doesn't really make it too much worse. > > In any case, merged with a reworded changelog. Yeah, that looks good. Someone in the community hit a long waiting-time issue when mounting in the fuse client; I checked those logs and it seems to be a similar issue, but I am not very sure yet. Thanks.
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 68f3b5ed6ac8..d747e9baf9c9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry) * Called under mdsc->mutex. */ static int __choose_mds(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req) + struct ceph_mds_request *req, + bool *random) { struct inode *inode; struct ceph_inode_info *ci; @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, u32 hash = req->r_direct_hash; bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + if (random) + *random = false; + /* * is there a specific mds we should try? ignore hint if we have * no session and the mds is not up (active or recovering). @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, return mds; random: + if (random) + *random = true; + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); dout("choose_mds chose random mds%d\n", mds); return mds; @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc, struct ceph_mds_session *session = NULL; int mds = -1; int err = 0; + bool random; if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc, put_request_session(req); - mds = __choose_mds(mdsc, req); + mds = __choose_mds(mdsc, req, &random); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { dout("do_request no mds or not active, waiting for map\n"); @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc, goto out_session; } if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) + session->s_state == CEPH_MDS_SESSION_CLOSING) { __open_session(mdsc, session); + /* retry the same mds later */ + if (random) + req->r_resend_mds = mds; + } 
list_add(&req->r_wait, &session->s_waiting); goto out_session; } @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } else { - int mds = __choose_mds(mdsc, req); + int mds = __choose_mds(mdsc, req, NULL); if (mds >= 0 && mds != req->r_session->s_mds) { dout("but auth changed, so resending\n"); __do_request(mdsc, req);