
ceph: retry the same mds later after the new session is opened

Message ID 20191209124715.2255-1-xiubli@redhat.com (mailing list archive)
State New, archived
Series ceph: retry the same mds later after the new session is opened

Commit Message

Xiubo Li Dec. 9, 2019, 12:47 p.m. UTC
From: Xiubo Li <xiubli@redhat.com>

With max_mds > 1, if a request is assigned a random mds rank whose
session is not opened yet, the request will wait for the session to
be opened and then be resent. But every time the request passes
through __do_request it first releases req->r_session and chooses a
rank again, so it may pick a different random mds. In the worst case
the client ends up opening sessions to all the mds ranks, one by one,
before the request can finally be sent.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)
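
The fix works because __choose_mds() already honors a per-request resend
hint before falling back to random placement (the "is there a specific
mds we should try?" check visible in the diff context below). A
simplified sketch of that existing path, paraphrased rather than copied
verbatim, with choose_mds_sketch() as a stand-in name for illustration
only:

	/*
	 * Sketch (not verbatim kernel code): once __do_request() records
	 * the randomly chosen rank in req->r_resend_mds, the next pass
	 * through this check returns the same rank instead of rolling a
	 * new random one, so only that one session has to finish opening.
	 */
	static int choose_mds_sketch(struct ceph_mds_client *mdsc,
				     struct ceph_mds_request *req)
	{
		/* honor an explicit resend hint if that mds is usable */
		if (req->r_resend_mds >= 0 &&
		    (__have_session(mdsc, req->r_resend_mds) ||
		     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0))
			return req->r_resend_mds;

		/* ... hash/inode-based placement omitted ... */

		/* last resort: pick a random active mds */
		return ceph_mdsmap_get_random_mds(mdsc->mdsmap);
	}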

Comments

Xiubo Li Dec. 9, 2019, 2:42 p.m. UTC | #1
This can be seen almost every time when mounting:


  92959 <7>[13220.815822] ceph: mount start 00000000e3b1070a
  92960 <6>[13220.817349] libceph: mon2 (1)192.168.195.165:40566 session 
established
  92961 <7>[13220.818639] ceph:  handle_map epoch 58 len 889
  92962 <7>[13220.818644] ceph:  mdsmap_decode 1/3 4216 mds0.5 
(1)192.168.195.165:6813 up:active            ==> max_mds == 3
  92963 <7>[13220.818646] ceph:  mdsmap_decode 2/3 4339 mds1.55 
(1)192.168.195.165:6815 up:active
  92964 <7>[13220.818646] ceph:  mdsmap_decode 3/3 4340 mds2.57 
(1)192.168.195.165:6814 up:active
  92965 <7>[13220.818648] ceph:  mdsmap_decode m_enabled: 1, m_damaged: 
0, m_num_laggy: 0
  92966 <7>[13220.818648] ceph:  mdsmap_decode success epoch 58
  92967 <7>[13220.818649] ceph:  check_new_map new 58 old 0
  92968 <6>[13220.820577] libceph: client5819 fsid 
6c39c42b-5888-4c6a-ba32-89acfa5a2353
  92969 <7>[13220.821827] ceph:  mount opening path \t
  92970 <7>[13220.821887] ceph:  open_root_inode opening ''
  92971 <7>[13220.821892] ceph:  do_request on 000000005666e7bf
  92972 <7>[13220.821893] ceph:  submit_request on 000000005666e7bf for 
inode 0000000057da6992
  92973 <7>[13220.821896] ceph:  __register_request 000000005666e7bf tid 1
  92974 <7>[13220.821898] ceph:  __choose_mds 0000000057da6992 is_hash=0 
(0) mode 0
  92975 <7>[13220.821899] ceph:  choose_mds chose random 
mds0             ====================> random mds0
  92976 <7>[13220.821901] ceph:  register_session: realloc to 1
  92977 <7>[13220.821902] ceph:  register_session: mds0
  92978 <7>[13220.821905] ceph:  mdsc get_session 000000001649fdbd 2 -> 3
  92979 <7>[13220.821905] ceph:  mdsc con_get 000000001649fdbd ok (3)
  92980 <7>[13220.821910] ceph:  mdsc get_session 000000001649fdbd 3 -> 4
  92981 <7>[13220.821912] ceph:  do_request mds0 session 
000000001649fdbd state new
  92982 <7>[13220.821913] ceph:  open_session to mds0 (up:active)
  92983 [...]
  92984 <7>[13220.822167] ceph:  do_request waiting
  92985 [...]
  92986 <7>[13220.833112] ceph:  handle_session mds0 open 
000000001649fdbd state opening seq 0
  92987 <7>[13220.833113] ceph:  renewed_caps mds0 ttl now 4307945924, 
was fresh, now stale
  92988 <7>[13220.833114] ceph:   wake request 000000005666e7bf tid 1
  92989 <7>[13220.833116] ceph:  mdsc put_session 000000001649fdbd 4 -> 3
  92990 <7>[13220.833117] ceph:  __choose_mds 0000000057da6992 is_hash=0 
(0) mode 0
  92991 <7>[13220.833118] ceph:  choose_mds chose random mds1   
========================> random mds1
  92992 <7>[13220.833119] ceph:  register_session: realloc to 2
  92993 <7>[13220.833121] ceph:  register_session: mds1
  92994 <7>[13220.833122] ceph:  mdsc get_session 00000000534bd3ef 2 -> 3
  92995 <7>[13220.833123] ceph:  mdsc con_get 00000000534bd3ef ok (3)
  92996 <7>[13220.833124] ceph:  mdsc get_session 00000000534bd3ef 3 -> 4
  92997 <7>[13220.833124] ceph:  do_request mds1 session 
00000000534bd3ef state new
  92998 <7>[13220.833126] ceph:  open_session to mds1 (up:active)
  92999 [...]
  93000 <7>[13220.845883] ceph:  handle_session mds1 open 
00000000534bd3ef state opening seq 0
  93001 <7>[13220.845884] ceph:  renewed_caps mds1 ttl now 4307945935, 
was fresh, now stale
  93002 <7>[13220.845885] ceph:   wake request 000000005666e7bf tid 1
  93003 <7>[13220.845887] ceph:  mdsc put_session 00000000534bd3ef 4 -> 3
  93004 <7>[13220.845888] ceph:  __choose_mds 0000000057da6992 is_hash=0 
(0) mode 0
  93005 <7>[13220.845889] ceph:  choose_mds chose random mds2         
================> random mds2
  93006 <7>[13220.845890] ceph:  register_session: realloc to 4
  93007 <7>[13220.845891] ceph:  register_session: mds2
  93008 <7>[13220.845892] ceph:  mdsc get_session 0000000076259af8 2 -> 3
  93009 <7>[13220.845892] ceph:  mdsc con_get 0000000076259af8 ok (3)
  93010 <7>[13220.845893] ceph:  mdsc get_session 0000000076259af8 3 -> 4
  93011 <7>[13220.845893] ceph:  do_request mds2 session 
0000000076259af8 state new
  93012 <7>[13220.845894] ceph:  open_session to mds2 (up:active)
   [...]
  93014 <7>[13220.852986] ceph:  handle_session mds2 open 
0000000076259af8 state opening seq 0
  93015 <7>[13220.852987] ceph:  renewed_caps mds2 ttl now 4307945948, 
was fresh, now stale
  93016 <7>[13220.852988] ceph:   wake request 000000005666e7bf tid 1
  93017 <7>[13220.852990] ceph:  mdsc put_session 0000000076259af8 4 -> 3
  93018 <7>[13220.852991] ceph:  __choose_mds 0000000057da6992 is_hash=0 
(0) mode 0
  93019 <7>[13220.852992] ceph:  choose_mds chose random mds0 
                          ===============> choose random mds0 again
  93020 <7>[13220.852993] ceph:  mdsc get_session 0000000076259af8 3 -> 4
  93021 <7>[13220.852994] ceph:  mdsc get_session 0000000076259af8 4 -> 5
  93022 <7>[13220.852994] ceph:  do_request mds0 session 
0000000076259af8 state open
  93023 <7>[13220.852996] ceph:  prepare_send_request 000000005666e7bf 
tid 1 getattr (attempt 1)
  93024 <7>[13220.852997] ceph:   path
  93025 <7>[13220.852998] ceph: r_parent = 0000000057da6992
  93026 [...]
  93027 <7>[13220.853414] ceph:  handle_reply 000000005666e7bf
  93028 <7>[13220.853415] ceph:  __unregister_request 000000005666e7bf tid 1
  93029 <7>[13220.853416] ceph:  got safe reply 1, mds0, last request: Yes
  93030 <7>[13220.853417] ceph:  handle_reply tid 1 result 0
  93031 <7>[13220.853421] ceph:  fill_trace 000000005666e7bf is_dentry 0 
is_target 1
  93032 <7>[13220.853424] ceph:  alloc_inode 00000000a823bfed
  93033 <7>[13220.853427] ceph:  get_inode created new inode 
00000000a823bfed 1.fffffffffffffffe ino 1
  93034 <7>[13220.853428] ceph:  get_inode on 1=1.fffffffffffffffe got 
00000000a823bfed
  93035 <7>[13220.853429] ceph:  fill_inode 00000000a823bfed ino 
1.fffffffffffffffe v 81286 had 0
  93036 <7>[13220.853431] ceph:  00000000a823bfed mode 040755 uid.gid 0.0
  93037 <7>[13220.853433] ceph:  truncate_size 0 -> 18446744073709551615
  93038 <7>[13220.853435] ceph:  fill_trace done err=0
  93039 <7>[13220.853441] ceph:  mdsc con_put 0000000076259af8 (4)
  93040 <7>[13220.853442] ceph:  mdsc put_session 0000000076259af8 5 -> 4
  93041 <7>[13220.853446] ceph:  do_request waited, got 0
  93042 <7>[13220.853447] ceph:  do_request 000000005666e7bf done, result 0
  93043 <7>[13220.853447] ceph:  open_root_inode success
  93044 <7>[13220.853452] ceph:  open_root_inode success, root dentry is 
0000000037517481
[...]
  93050 <7>[13220.853457] ceph:  mount success


On 2019/12/9 20:47, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
>
> With max_mds > 1, if a request is assigned a random mds rank whose
> session is not opened yet, the request will wait for the session to
> be opened and then be resent. But every time the request passes
> through __do_request it first releases req->r_session and chooses a
> rank again, so it may pick a different random mds. In the worst case
> the client ends up opening sessions to all the mds ranks, one by one,
> before the request can finally be sent.
>
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>   fs/ceph/mds_client.c | 20 ++++++++++++++++----
>   1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 68f3b5ed6ac8..d747e9baf9c9 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry)
>    * Called under mdsc->mutex.
>    */
>   static int __choose_mds(struct ceph_mds_client *mdsc,
> -			struct ceph_mds_request *req)
> +			struct ceph_mds_request *req,
> +			bool *random)
>   {
>   	struct inode *inode;
>   	struct ceph_inode_info *ci;
> @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>   	u32 hash = req->r_direct_hash;
>   	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
>   
> +	if (random)
> +		*random = false;
> +
>   	/*
>   	 * is there a specific mds we should try?  ignore hint if we have
>   	 * no session and the mds is not up (active or recovering).
> @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>   	return mds;
>   
>   random:
> +	if (random)
> +		*random = true;
> +
>   	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
>   	dout("choose_mds chose random mds%d\n", mds);
>   	return mds;
> @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
>   	struct ceph_mds_session *session = NULL;
>   	int mds = -1;
>   	int err = 0;
> +	bool random;
>   
>   	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
>   		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
> @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
>   
>   	put_request_session(req);
>   
> -	mds = __choose_mds(mdsc, req);
> +	mds = __choose_mds(mdsc, req, &random);
>   	if (mds < 0 ||
>   	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
>   		dout("do_request no mds or not active, waiting for map\n");
> @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc,
>   			goto out_session;
>   		}
>   		if (session->s_state == CEPH_MDS_SESSION_NEW ||
> -		    session->s_state == CEPH_MDS_SESSION_CLOSING)
> +		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
>   			__open_session(mdsc, session);
> +			/* retry the same mds later */
> +			if (random)
> +				req->r_resend_mds = mds;
> +		}
>   		list_add(&req->r_wait, &session->s_waiting);
>   		goto out_session;
>   	}
> @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
>   			mutex_unlock(&mdsc->mutex);
>   			goto out;
>   		} else  {
> -			int mds = __choose_mds(mdsc, req);
> +			int mds = __choose_mds(mdsc, req, NULL);
>   			if (mds >= 0 && mds != req->r_session->s_mds) {
>   				dout("but auth changed, so resending\n");
>   				__do_request(mdsc, req);
Jeff Layton Dec. 11, 2019, 1:41 p.m. UTC | #2
On Mon, 2019-12-09 at 07:47 -0500, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> With max_mds > 1, if a request is assigned a random mds rank whose
> session is not opened yet, the request will wait for the session to
> be opened and then be resent. But every time the request passes
> through __do_request it first releases req->r_session and chooses a
> rank again, so it may pick a different random mds. In the worst case
> the client ends up opening sessions to all the mds ranks, one by one,
> before the request can finally be sent.
> 
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/mds_client.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 68f3b5ed6ac8..d747e9baf9c9 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry)
>   * Called under mdsc->mutex.
>   */
>  static int __choose_mds(struct ceph_mds_client *mdsc,
> -			struct ceph_mds_request *req)
> +			struct ceph_mds_request *req,
> +			bool *random)
>  {
>  	struct inode *inode;
>  	struct ceph_inode_info *ci;
> @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>  	u32 hash = req->r_direct_hash;
>  	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
>  
> +	if (random)
> +		*random = false;
> +
>  	/*
>  	 * is there a specific mds we should try?  ignore hint if we have
>  	 * no session and the mds is not up (active or recovering).
> @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>  	return mds;
>  
>  random:
> +	if (random)
> +		*random = true;
> +
>  	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
>  	dout("choose_mds chose random mds%d\n", mds);
>  	return mds;
> @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
>  	struct ceph_mds_session *session = NULL;
>  	int mds = -1;
>  	int err = 0;
> +	bool random;
>  
>  	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
>  		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
> @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
>  
>  	put_request_session(req);
>  
> -	mds = __choose_mds(mdsc, req);
> +	mds = __choose_mds(mdsc, req, &random);
>  	if (mds < 0 ||
>  	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
>  		dout("do_request no mds or not active, waiting for map\n");
> @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc,
>  			goto out_session;
>  		}
>  		if (session->s_state == CEPH_MDS_SESSION_NEW ||
> -		    session->s_state == CEPH_MDS_SESSION_CLOSING)
> +		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
>  			__open_session(mdsc, session);
> +			/* retry the same mds later */
> +			if (random)
> +				req->r_resend_mds = mds;
> +		}
>  		list_add(&req->r_wait, &session->s_waiting);
>  		goto out_session;
>  	}
> @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
>  			mutex_unlock(&mdsc->mutex);
>  			goto out;
>  		} else  {
> -			int mds = __choose_mds(mdsc, req);
> +			int mds = __choose_mds(mdsc, req, NULL);
>  			if (mds >= 0 && mds != req->r_session->s_mds) {
>  				dout("but auth changed, so resending\n");
>  				__do_request(mdsc, req);

Is there a tracker bug for this?

This does seem like the behavior we'd want. I wish it were a little less
Rube Goldberg to do this, but this code is already pretty messy so this
doesn't really make it too much worse.

In any case, merged with a reworded changelog.
Xiubo Li Dec. 12, 2019, 12:13 a.m. UTC | #3
On 2019/12/11 21:41, Jeff Layton wrote:
> On Mon, 2019-12-09 at 07:47 -0500, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> With max_mds > 1, if a request is assigned a random mds rank whose
>> session is not opened yet, the request will wait for the session to
>> be opened and then be resent. But every time the request passes
>> through __do_request it first releases req->r_session and chooses a
>> rank again, so it may pick a different random mds. In the worst case
>> the client ends up opening sessions to all the mds ranks, one by one,
>> before the request can finally be sent.
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/mds_client.c | 20 ++++++++++++++++----
>>   1 file changed, 16 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 68f3b5ed6ac8..d747e9baf9c9 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -876,7 +876,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry)
>>    * Called under mdsc->mutex.
>>    */
>>   static int __choose_mds(struct ceph_mds_client *mdsc,
>> -			struct ceph_mds_request *req)
>> +			struct ceph_mds_request *req,
>> +			bool *random)
>>   {
>>   	struct inode *inode;
>>   	struct ceph_inode_info *ci;
>> @@ -886,6 +887,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>>   	u32 hash = req->r_direct_hash;
>>   	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
>>   
>> +	if (random)
>> +		*random = false;
>> +
>>   	/*
>>   	 * is there a specific mds we should try?  ignore hint if we have
>>   	 * no session and the mds is not up (active or recovering).
>> @@ -1021,6 +1025,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
>>   	return mds;
>>   
>>   random:
>> +	if (random)
>> +		*random = true;
>> +
>>   	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
>>   	dout("choose_mds chose random mds%d\n", mds);
>>   	return mds;
>> @@ -2556,6 +2563,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
>>   	struct ceph_mds_session *session = NULL;
>>   	int mds = -1;
>>   	int err = 0;
>> +	bool random;
>>   
>>   	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
>>   		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
>> @@ -2596,7 +2604,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
>>   
>>   	put_request_session(req);
>>   
>> -	mds = __choose_mds(mdsc, req);
>> +	mds = __choose_mds(mdsc, req, &random);
>>   	if (mds < 0 ||
>>   	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
>>   		dout("do_request no mds or not active, waiting for map\n");
>> @@ -2624,8 +2632,12 @@ static void __do_request(struct ceph_mds_client *mdsc,
>>   			goto out_session;
>>   		}
>>   		if (session->s_state == CEPH_MDS_SESSION_NEW ||
>> -		    session->s_state == CEPH_MDS_SESSION_CLOSING)
>> +		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
>>   			__open_session(mdsc, session);
>> +			/* retry the same mds later */
>> +			if (random)
>> +				req->r_resend_mds = mds;
>> +		}
>>   		list_add(&req->r_wait, &session->s_waiting);
>>   		goto out_session;
>>   	}
>> @@ -2890,7 +2902,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
>>   			mutex_unlock(&mdsc->mutex);
>>   			goto out;
>>   		} else  {
>> -			int mds = __choose_mds(mdsc, req);
>> +			int mds = __choose_mds(mdsc, req, NULL);
>>   			if (mds >= 0 && mds != req->r_session->s_mds) {
>>   				dout("but auth changed, so resending\n");
>>   				__do_request(mdsc, req);
> Is there a tracker bug for this?

I created one just now: https://tracker.ceph.com/issues/43270


>
> This does seem like the behavior we'd want. I wish it were a little less
> Rube Goldberg to do this, but this code is already pretty messy so this
> doesn't really make it too much worse.
>
> In any case, merged with a reworded changelog.

Yeah, that looks good.

Someone in the community hit a long mount hang with the fuse client.
I checked those logs and it looks like a similar issue, but I'm not
sure yet.

Thanks.

Patch

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 68f3b5ed6ac8..d747e9baf9c9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -876,7 +876,8 @@  static struct inode *get_nonsnap_parent(struct dentry *dentry)
  * Called under mdsc->mutex.
  */
 static int __choose_mds(struct ceph_mds_client *mdsc,
-			struct ceph_mds_request *req)
+			struct ceph_mds_request *req,
+			bool *random)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
@@ -886,6 +887,9 @@  static int __choose_mds(struct ceph_mds_client *mdsc,
 	u32 hash = req->r_direct_hash;
 	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
 
+	if (random)
+		*random = false;
+
 	/*
 	 * is there a specific mds we should try?  ignore hint if we have
 	 * no session and the mds is not up (active or recovering).
@@ -1021,6 +1025,9 @@  static int __choose_mds(struct ceph_mds_client *mdsc,
 	return mds;
 
 random:
+	if (random)
+		*random = true;
+
 	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
 	dout("choose_mds chose random mds%d\n", mds);
 	return mds;
@@ -2556,6 +2563,7 @@  static void __do_request(struct ceph_mds_client *mdsc,
 	struct ceph_mds_session *session = NULL;
 	int mds = -1;
 	int err = 0;
+	bool random;
 
 	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
 		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
@@ -2596,7 +2604,7 @@  static void __do_request(struct ceph_mds_client *mdsc,
 
 	put_request_session(req);
 
-	mds = __choose_mds(mdsc, req);
+	mds = __choose_mds(mdsc, req, &random);
 	if (mds < 0 ||
 	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
 		dout("do_request no mds or not active, waiting for map\n");
@@ -2624,8 +2632,12 @@  static void __do_request(struct ceph_mds_client *mdsc,
 			goto out_session;
 		}
 		if (session->s_state == CEPH_MDS_SESSION_NEW ||
-		    session->s_state == CEPH_MDS_SESSION_CLOSING)
+		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
 			__open_session(mdsc, session);
+			/* retry the same mds later */
+			if (random)
+				req->r_resend_mds = mds;
+		}
 		list_add(&req->r_wait, &session->s_waiting);
 		goto out_session;
 	}
@@ -2890,7 +2902,7 @@  static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 			mutex_unlock(&mdsc->mutex);
 			goto out;
 		} else  {
-			int mds = __choose_mds(mdsc, req);
+			int mds = __choose_mds(mdsc, req, NULL);
 			if (mds >= 0 && mds != req->r_session->s_mds) {
 				dout("but auth changed, so resending\n");
 				__do_request(mdsc, req);