diff mbox

[20/39] mds: include replica nonce in MMDSCacheRejoin::inode_strong

Message ID 1363531902-24909-21-git-send-email-zheng.z.yan@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yan, Zheng March 17, 2013, 2:51 p.m. UTC
From: "Yan, Zheng" <zheng.z.yan@intel.com>

So the recovering MDS can properly handle cache expire messages.
Also increase the nonce value when sending the cache rejoin acks.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
---
 src/mds/MDCache.cc             | 35 +++++++++++++++++++++++------------
 src/messages/MMDSCacheRejoin.h | 11 +++++++----
 2 files changed, 30 insertions(+), 16 deletions(-)

Comments

Gregory Farnum March 20, 2013, 11:26 p.m. UTC | #1
On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> From: "Yan, Zheng" <zheng.z.yan@intel.com>
>
> So the recovering MDS can properly handle cache expire messages.
> Also increase the nonce value when sending the cache rejoin acks.
>
> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> ---
>  src/mds/MDCache.cc             | 35 +++++++++++++++++++++++------------
>  src/messages/MMDSCacheRejoin.h | 11 +++++++----
>  2 files changed, 30 insertions(+), 16 deletions(-)
>
> diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> index 008a8a2..8ba676e 100644
> --- a/src/mds/MDCache.cc
> +++ b/src/mds/MDCache.cc
> @@ -3538,6 +3538,7 @@ void MDCache::rejoin_send_rejoins()
>        if (p->first == 0 && root) {
>         p->second->add_weak_inode(root->vino());
>         p->second->add_strong_inode(root->vino(),
> +                                   root->get_replica_nonce(),
>                                     root->get_caps_wanted(),
>                                     root->filelock.get_state(),
>                                     root->nestlock.get_state(),
> @@ -3551,6 +3552,7 @@ void MDCache::rejoin_send_rejoins()
>        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
>         p->second->add_weak_inode(in->vino());
>         p->second->add_strong_inode(in->vino(),
> +                                   in->get_replica_nonce(),
>                                     in->get_caps_wanted(),
>                                     in->filelock.get_state(),
>                                     in->nestlock.get_state(),
> @@ -3709,6 +3711,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
>         CInode *in = dnl->get_inode();
>         dout(15) << " add_strong_inode " << *in << dendl;
>         rejoin->add_strong_inode(in->vino(),
> +                                in->get_replica_nonce(),
>                                  in->get_caps_wanted(),
>                                  in->filelock.get_state(),
>                                  in->nestlock.get_state(),
> @@ -4248,7 +4251,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         dir = rejoin_invent_dirfrag(p->first);
>      }
>      if (dir) {
> -      dir->add_replica(from);
> +      dir->add_replica(from, p->second.nonce);
>        dir->dir_rep = p->second.dir_rep;
>      } else {
>        dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
> @@ -4263,7 +4266,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>           dir = rejoin_invent_dirfrag(p->first);
>         else
>           dout(10) << " have(approx) " << *dir << dendl;
> -       dir->add_replica(from);
> +       dir->add_replica(from, p->second.nonce);
>         dir->dir_rep = p->second.dir_rep;
>        }
>        refragged = true;
> @@ -4327,7 +4330,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>         mdr->locks.insert(&dn->lock);
>        }
>
> -      dn->add_replica(from);
> +      dn->add_replica(from, q->second.nonce);
>        dout(10) << " have " << *dn << dendl;
>
>        // inode?
> @@ -4412,7 +4415,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
>           dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
>         }
>
> -       in->add_replica(from);
> +       in->add_replica(from, p->second.nonce);
>         dout(10) << " have " << *in << dendl;
>        }
>      }
> @@ -5176,7 +5179,7 @@ void MDCache::rejoin_send_acks()
>        for (map<int,int>::iterator r = dir->replicas_begin();
>            r != dir->replicas_end();
>            ++r)
> -       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep);
> +       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
>
>        for (CDir::map_t::iterator q = dir->items.begin();
>            q != dir->items.end();
> @@ -5192,7 +5195,7 @@ void MDCache::rejoin_send_acks()
>                                            dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
>                                            dnl->is_remote() ? dnl->get_remote_d_type():0,
> -                                          r->second,
> +                                          ++r->second,
>                                            dn->lock.get_replica_state());
>
>         if (!dnl->is_primary())
> @@ -5205,7 +5208,7 @@ void MDCache::rejoin_send_acks()
>              r != in->replicas_end();
>              ++r) {
>           ack[r->first]->add_inode_base(in);
> -         ack[r->first]->add_inode_locks(in, r->second);
> +         ack[r->first]->add_inode_locks(in, ++r->second);
>         }
>
>         // subdirs in this subtree?
> @@ -5220,14 +5223,14 @@ void MDCache::rejoin_send_acks()
>          r != root->replicas_end();
>          ++r) {
>        ack[r->first]->add_inode_base(root);
> -      ack[r->first]->add_inode_locks(root, r->second);
> +      ack[r->first]->add_inode_locks(root, ++r->second);
>      }
>    if (myin)
>      for (map<int,int>::iterator r = myin->replicas_begin();
>          r != myin->replicas_end();
>          ++r) {
>        ack[r->first]->add_inode_base(myin);
> -      ack[r->first]->add_inode_locks(myin, r->second);
> +      ack[r->first]->add_inode_locks(myin, ++r->second);
>      }
>
>    // include inode base for any inodes whose scatterlocks may have updated
> @@ -5728,6 +5731,12 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
>    for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
>         it != expiremap.end();
>         ++it) {
> +    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
> +       (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
> +        rejoin_sent.count(it->first) == 0)) {
> +      it->second->put();
> +      continue;
> +    }
>      dout(7) << "sending cache_expire to " << it->first << dendl;
>      mds->send_message_mds(it->second, it->first);
>    }
> @@ -9640,9 +9649,11 @@ void MDCache::handle_dentry_link(MDentryLink *m)
>      CInode *in = add_replica_inode(p, NULL, finished);
>      assert(in->get_num_ref() == 0);
>      assert(in->get_parent_dn() == NULL);
> -    MCacheExpire* expire = new MCacheExpire(mds->get_nodeid());
> -    expire->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> -    mds->send_message_mds(expire, m->get_source().num());
> +    map<int, MCacheExpire*> expiremap;
> +    int from = m->get_source().num();
> +    expiremap[from] = new MCacheExpire(mds->get_nodeid());
> +    expiremap[from]->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> +    send_expire_messages(expiremap);
>      remove_inode(in);
>    }
>
> diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
> index 825400d..b88f551 100644
> --- a/src/messages/MMDSCacheRejoin.h
> +++ b/src/messages/MMDSCacheRejoin.h
> @@ -43,19 +43,22 @@ class MMDSCacheRejoin : public Message {
>
>    // -- types --
>    struct inode_strong {
> +    int32_t nonce;
>      int32_t caps_wanted;
>      int32_t filelock, nestlock, dftlock;
>      inode_strong() {}
> -    inode_strong(int cw, int dl, int nl, int dftl) :
> -      caps_wanted(cw),
> +    inode_strong(int n, int cw, int dl, int nl, int dftl) :
> +      nonce(n), caps_wanted(cw),
>        filelock(dl), nestlock(nl), dftlock(dftl) { }
>      void encode(bufferlist &bl) const {
> +      ::encode(nonce, bl);
>        ::encode(caps_wanted, bl);
>        ::encode(filelock, bl);
>        ::encode(nestlock, bl);
>        ::encode(dftlock, bl);
>      }
>      void decode(bufferlist::iterator &bl) {
> +      ::decode(nonce, bl);
>        ::decode(caps_wanted, bl);
>        ::decode(filelock, bl);
>        ::decode(nestlock, bl);

This is a wire format change without any versioning to cover it —
we're going to need to at a minimum add feature bits to cover this. It
might be more appropriate to introduce proper versioning at the same
time, though. You should find examples of everything you need in my
recent encoding changes.

The rest looks good.
-Greg

> @@ -208,8 +211,8 @@ public:
>    void add_weak_inode(vinodeno_t i) {
>      weak_inodes.insert(i);
>    }
> -  void add_strong_inode(vinodeno_t i, int cw, int dl, int nl, int dftl) {
> -    strong_inodes[i] = inode_strong(cw, dl, nl, dftl);
> +  void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) {
> +    strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl);
>    }
>    void add_inode_locks(CInode *in, __u32 nonce) {
>      ::encode(in->inode.ino, inode_locks);
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sage Weil March 20, 2013, 11:36 p.m. UTC | #2
On Wed, 20 Mar 2013, Gregory Farnum wrote:
> On Sun, Mar 17, 2013 at 7:51 AM, Yan, Zheng <zheng.z.yan@intel.com> wrote:
> > From: "Yan, Zheng" <zheng.z.yan@intel.com>
> >
> > So the recovering MDS can properly handle cache expire messages.
> > Also increase the nonce value when sending the cache rejoin acks.
> >
> > Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
> > ---
> >  src/mds/MDCache.cc             | 35 +++++++++++++++++++++++------------
> >  src/messages/MMDSCacheRejoin.h | 11 +++++++----
> >  2 files changed, 30 insertions(+), 16 deletions(-)
> >
> > diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
> > index 008a8a2..8ba676e 100644
> > --- a/src/mds/MDCache.cc
> > +++ b/src/mds/MDCache.cc
> > @@ -3538,6 +3538,7 @@ void MDCache::rejoin_send_rejoins()
> >        if (p->first == 0 && root) {
> >         p->second->add_weak_inode(root->vino());
> >         p->second->add_strong_inode(root->vino(),
> > +                                   root->get_replica_nonce(),
> >                                     root->get_caps_wanted(),
> >                                     root->filelock.get_state(),
> >                                     root->nestlock.get_state(),
> > @@ -3551,6 +3552,7 @@ void MDCache::rejoin_send_rejoins()
> >        if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
> >         p->second->add_weak_inode(in->vino());
> >         p->second->add_strong_inode(in->vino(),
> > +                                   in->get_replica_nonce(),
> >                                     in->get_caps_wanted(),
> >                                     in->filelock.get_state(),
> >                                     in->nestlock.get_state(),
> > @@ -3709,6 +3711,7 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
> >         CInode *in = dnl->get_inode();
> >         dout(15) << " add_strong_inode " << *in << dendl;
> >         rejoin->add_strong_inode(in->vino(),
> > +                                in->get_replica_nonce(),
> >                                  in->get_caps_wanted(),
> >                                  in->filelock.get_state(),
> >                                  in->nestlock.get_state(),
> > @@ -4248,7 +4251,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >         dir = rejoin_invent_dirfrag(p->first);
> >      }
> >      if (dir) {
> > -      dir->add_replica(from);
> > +      dir->add_replica(from, p->second.nonce);
> >        dir->dir_rep = p->second.dir_rep;
> >      } else {
> >        dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
> > @@ -4263,7 +4266,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >           dir = rejoin_invent_dirfrag(p->first);
> >         else
> >           dout(10) << " have(approx) " << *dir << dendl;
> > -       dir->add_replica(from);
> > +       dir->add_replica(from, p->second.nonce);
> >         dir->dir_rep = p->second.dir_rep;
> >        }
> >        refragged = true;
> > @@ -4327,7 +4330,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >         mdr->locks.insert(&dn->lock);
> >        }
> >
> > -      dn->add_replica(from);
> > +      dn->add_replica(from, q->second.nonce);
> >        dout(10) << " have " << *dn << dendl;
> >
> >        // inode?
> > @@ -4412,7 +4415,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
> >           dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
> >         }
> >
> > -       in->add_replica(from);
> > +       in->add_replica(from, p->second.nonce);
> >         dout(10) << " have " << *in << dendl;
> >        }
> >      }
> > @@ -5176,7 +5179,7 @@ void MDCache::rejoin_send_acks()
> >        for (map<int,int>::iterator r = dir->replicas_begin();
> >            r != dir->replicas_end();
> >            ++r)
> > -       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep);
> > +       ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
> >
> >        for (CDir::map_t::iterator q = dir->items.begin();
> >            q != dir->items.end();
> > @@ -5192,7 +5195,7 @@ void MDCache::rejoin_send_acks()
> >                                            dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
> >                                            dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
> >                                            dnl->is_remote() ? dnl->get_remote_d_type():0,
> > -                                          r->second,
> > +                                          ++r->second,
> >                                            dn->lock.get_replica_state());
> >
> >         if (!dnl->is_primary())
> > @@ -5205,7 +5208,7 @@ void MDCache::rejoin_send_acks()
> >              r != in->replicas_end();
> >              ++r) {
> >           ack[r->first]->add_inode_base(in);
> > -         ack[r->first]->add_inode_locks(in, r->second);
> > +         ack[r->first]->add_inode_locks(in, ++r->second);
> >         }
> >
> >         // subdirs in this subtree?
> > @@ -5220,14 +5223,14 @@ void MDCache::rejoin_send_acks()
> >          r != root->replicas_end();
> >          ++r) {
> >        ack[r->first]->add_inode_base(root);
> > -      ack[r->first]->add_inode_locks(root, r->second);
> > +      ack[r->first]->add_inode_locks(root, ++r->second);
> >      }
> >    if (myin)
> >      for (map<int,int>::iterator r = myin->replicas_begin();
> >          r != myin->replicas_end();
> >          ++r) {
> >        ack[r->first]->add_inode_base(myin);
> > -      ack[r->first]->add_inode_locks(myin, r->second);
> > +      ack[r->first]->add_inode_locks(myin, ++r->second);
> >      }
> >
> >    // include inode base for any inodes whose scatterlocks may have updated
> > @@ -5728,6 +5731,12 @@ void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
> >    for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
> >         it != expiremap.end();
> >         ++it) {
> > +    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
> > +       (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
> > +        rejoin_sent.count(it->first) == 0)) {
> > +      it->second->put();
> > +      continue;
> > +    }
> >      dout(7) << "sending cache_expire to " << it->first << dendl;
> >      mds->send_message_mds(it->second, it->first);
> >    }
> > @@ -9640,9 +9649,11 @@ void MDCache::handle_dentry_link(MDentryLink *m)
> >      CInode *in = add_replica_inode(p, NULL, finished);
> >      assert(in->get_num_ref() == 0);
> >      assert(in->get_parent_dn() == NULL);
> > -    MCacheExpire* expire = new MCacheExpire(mds->get_nodeid());
> > -    expire->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> > -    mds->send_message_mds(expire, m->get_source().num());
> > +    map<int, MCacheExpire*> expiremap;
> > +    int from = m->get_source().num();
> > +    expiremap[from] = new MCacheExpire(mds->get_nodeid());
> > +    expiremap[from]->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
> > +    send_expire_messages(expiremap);
> >      remove_inode(in);
> >    }
> >
> > diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
> > index 825400d..b88f551 100644
> > --- a/src/messages/MMDSCacheRejoin.h
> > +++ b/src/messages/MMDSCacheRejoin.h
> > @@ -43,19 +43,22 @@ class MMDSCacheRejoin : public Message {
> >
> >    // -- types --
> >    struct inode_strong {
> > +    int32_t nonce;
> >      int32_t caps_wanted;
> >      int32_t filelock, nestlock, dftlock;
> >      inode_strong() {}
> > -    inode_strong(int cw, int dl, int nl, int dftl) :
> > -      caps_wanted(cw),
> > +    inode_strong(int n, int cw, int dl, int nl, int dftl) :
> > +      nonce(n), caps_wanted(cw),
> >        filelock(dl), nestlock(nl), dftlock(dftl) { }
> >      void encode(bufferlist &bl) const {
> > +      ::encode(nonce, bl);
> >        ::encode(caps_wanted, bl);
> >        ::encode(filelock, bl);
> >        ::encode(nestlock, bl);
> >        ::encode(dftlock, bl);
> >      }
> >      void decode(bufferlist::iterator &bl) {
> > +      ::decode(nonce, bl);
> >        ::decode(caps_wanted, bl);
> >        ::decode(filelock, bl);
> >        ::decode(nestlock, bl);
> 
> This is a wire format change without any versioning to cover it ?
> we're going to need to at a minimum add feature bits to cover this. It
> might be more appropriate to introduce proper versioning at the same
> time, though. You should find examples of everything you need in my
> recent encoding changes.

Since we're not too concerned about rolling upgrades for the mds cluster, 
we could just bump the CEPH_MDSC_PROTOCOL instead of spending another 
feature bit (we're halfway through them!).

...but we should also move to the new encoding macros opportunistically :)

s

> 
> The rest looks good.
> -Greg
> 
> > @@ -208,8 +211,8 @@ public:
> >    void add_weak_inode(vinodeno_t i) {
> >      weak_inodes.insert(i);
> >    }
> > -  void add_strong_inode(vinodeno_t i, int cw, int dl, int nl, int dftl) {
> > -    strong_inodes[i] = inode_strong(cw, dl, nl, dftl);
> > +  void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) {
> > +    strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl);
> >    }
> >    void add_inode_locks(CInode *in, __u32 nonce) {
> >      ::encode(in->inode.ino, inode_locks);
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 008a8a2..8ba676e 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -3538,6 +3538,7 @@  void MDCache::rejoin_send_rejoins()
       if (p->first == 0 && root) {
 	p->second->add_weak_inode(root->vino());
 	p->second->add_strong_inode(root->vino(),
+				    root->get_replica_nonce(),
 				    root->get_caps_wanted(),
 				    root->filelock.get_state(),
 				    root->nestlock.get_state(),
@@ -3551,6 +3552,7 @@  void MDCache::rejoin_send_rejoins()
       if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) {
 	p->second->add_weak_inode(in->vino());
 	p->second->add_strong_inode(in->vino(),
+				    in->get_replica_nonce(),
 				    in->get_caps_wanted(),
 				    in->filelock.get_state(),
 				    in->nestlock.get_state(),
@@ -3709,6 +3711,7 @@  void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
 	CInode *in = dnl->get_inode();
 	dout(15) << " add_strong_inode " << *in << dendl;
 	rejoin->add_strong_inode(in->vino(),
+				 in->get_replica_nonce(),
 				 in->get_caps_wanted(),
 				 in->filelock.get_state(),
 				 in->nestlock.get_state(),
@@ -4248,7 +4251,7 @@  void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	dir = rejoin_invent_dirfrag(p->first);
     }
     if (dir) {
-      dir->add_replica(from);
+      dir->add_replica(from, p->second.nonce);
       dir->dir_rep = p->second.dir_rep;
     } else {
       dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl;
@@ -4263,7 +4266,7 @@  void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	  dir = rejoin_invent_dirfrag(p->first);
 	else
 	  dout(10) << " have(approx) " << *dir << dendl;
-	dir->add_replica(from);
+	dir->add_replica(from, p->second.nonce);
 	dir->dir_rep = p->second.dir_rep;
       }
       refragged = true;
@@ -4327,7 +4330,7 @@  void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	mdr->locks.insert(&dn->lock);
       }
 
-      dn->add_replica(from);
+      dn->add_replica(from, q->second.nonce);
       dout(10) << " have " << *dn << dendl;
       
       // inode?
@@ -4412,7 +4415,7 @@  void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
 	  dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl;
 	}
 	
-	in->add_replica(from);
+	in->add_replica(from, p->second.nonce);
 	dout(10) << " have " << *in << dendl;
       }
     }
@@ -5176,7 +5179,7 @@  void MDCache::rejoin_send_acks()
       for (map<int,int>::iterator r = dir->replicas_begin();
 	   r != dir->replicas_end();
 	   ++r) 
-	ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep);
+	ack[r->first]->add_strong_dirfrag(dir->dirfrag(), ++r->second, dir->dir_rep);
 	   
       for (CDir::map_t::iterator q = dir->items.begin();
 	   q != dir->items.end();
@@ -5192,7 +5195,7 @@  void MDCache::rejoin_send_acks()
 					   dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
 					   dnl->is_remote() ? dnl->get_remote_d_type():0,
-					   r->second,
+					   ++r->second,
 					   dn->lock.get_replica_state());
 	
 	if (!dnl->is_primary())
@@ -5205,7 +5208,7 @@  void MDCache::rejoin_send_acks()
 	     r != in->replicas_end();
 	     ++r) {
 	  ack[r->first]->add_inode_base(in);
-	  ack[r->first]->add_inode_locks(in, r->second);
+	  ack[r->first]->add_inode_locks(in, ++r->second);
 	}
 	
 	// subdirs in this subtree?
@@ -5220,14 +5223,14 @@  void MDCache::rejoin_send_acks()
 	 r != root->replicas_end();
 	 ++r) {
       ack[r->first]->add_inode_base(root);
-      ack[r->first]->add_inode_locks(root, r->second);
+      ack[r->first]->add_inode_locks(root, ++r->second);
     }
   if (myin)
     for (map<int,int>::iterator r = myin->replicas_begin();
 	 r != myin->replicas_end();
 	 ++r) {
       ack[r->first]->add_inode_base(myin);
-      ack[r->first]->add_inode_locks(myin, r->second);
+      ack[r->first]->add_inode_locks(myin, ++r->second);
     }
 
   // include inode base for any inodes whose scatterlocks may have updated
@@ -5728,6 +5731,12 @@  void MDCache::send_expire_messages(map<int, MCacheExpire*>& expiremap)
   for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
        it != expiremap.end();
        ++it) {
+    if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN ||
+	(mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN &&
+	 rejoin_sent.count(it->first) == 0)) {
+      it->second->put();
+      continue;
+    }
     dout(7) << "sending cache_expire to " << it->first << dendl;
     mds->send_message_mds(it->second, it->first);
   }
@@ -9640,9 +9649,11 @@  void MDCache::handle_dentry_link(MDentryLink *m)
     CInode *in = add_replica_inode(p, NULL, finished);
     assert(in->get_num_ref() == 0);
     assert(in->get_parent_dn() == NULL);
-    MCacheExpire* expire = new MCacheExpire(mds->get_nodeid());
-    expire->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
-    mds->send_message_mds(expire, m->get_source().num());
+    map<int, MCacheExpire*> expiremap;
+    int from = m->get_source().num();
+    expiremap[from] = new MCacheExpire(mds->get_nodeid());
+    expiremap[from]->add_inode(m->get_subtree(), in->vino(), in->get_replica_nonce());
+    send_expire_messages(expiremap);
     remove_inode(in);
   }
 
diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h
index 825400d..b88f551 100644
--- a/src/messages/MMDSCacheRejoin.h
+++ b/src/messages/MMDSCacheRejoin.h
@@ -43,19 +43,22 @@  class MMDSCacheRejoin : public Message {
 
   // -- types --
   struct inode_strong { 
+    int32_t nonce;
     int32_t caps_wanted;
     int32_t filelock, nestlock, dftlock;
     inode_strong() {}
-    inode_strong(int cw, int dl, int nl, int dftl) : 
-      caps_wanted(cw),
+    inode_strong(int n, int cw, int dl, int nl, int dftl) :
+      nonce(n), caps_wanted(cw),
       filelock(dl), nestlock(nl), dftlock(dftl) { }
     void encode(bufferlist &bl) const {
+      ::encode(nonce, bl);
       ::encode(caps_wanted, bl);
       ::encode(filelock, bl);
       ::encode(nestlock, bl);
       ::encode(dftlock, bl);
     }
     void decode(bufferlist::iterator &bl) {
+      ::decode(nonce, bl);
       ::decode(caps_wanted, bl);
       ::decode(filelock, bl);
       ::decode(nestlock, bl);
@@ -208,8 +211,8 @@  public:
   void add_weak_inode(vinodeno_t i) {
     weak_inodes.insert(i);
   }
-  void add_strong_inode(vinodeno_t i, int cw, int dl, int nl, int dftl) {
-    strong_inodes[i] = inode_strong(cw, dl, nl, dftl);
+  void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) {
+    strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl);
   }
   void add_inode_locks(CInode *in, __u32 nonce) {
     ::encode(in->inode.ino, inode_locks);