Message ID | 20170225174323.20289-4-jlayton@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote: > Cephfs can get cap update requests that contain a new epoch barrier in > them. When that happens we want to pause all OSD traffic until the right > map epoch arrives. > > Add an epoch_barrier field to ceph_osd_client that is protected by the > osdc->lock rwsem. When the barrier is set, and the current OSD map > epoch is below that, pause the request target when submitting the > request or when revisiting it. Add a way for upper layers (cephfs) > to update the epoch_barrier as well. > > If we get a new map, compare the new epoch against the barrier before > kicking requests and request another map if the map epoch is still lower > than the one we want. > > If we end up cancelling requests because of a new map showing a full OSD > or pool condition, then set the barrier higher than the highest replay > epoch of all the cancelled requests. > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > --- > include/linux/ceph/osd_client.h | 2 ++ > net/ceph/osd_client.c | 51 +++++++++++++++++++++++++++++++++-------- > 2 files changed, 43 insertions(+), 10 deletions(-) > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > index 55dcff2455e0..833942226560 100644 > --- a/include/linux/ceph/osd_client.h > +++ b/include/linux/ceph/osd_client.h > @@ -266,6 +266,7 @@ struct ceph_osd_client { > struct rb_root osds; /* osds */ > struct list_head osd_lru; /* idle osds */ > spinlock_t osd_lru_lock; > + u32 epoch_barrier; It would be good to include it in debugfs -- osdmap_show(). > struct ceph_osd homeless_osd; > atomic64_t last_tid; /* tid of last request */ > u64 last_linger_id; > @@ -304,6 +305,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, > struct ceph_msg *msg); > extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, > struct ceph_msg *msg); > +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); > > extern void osd_req_op_init(struct ceph_osd_request *osd_req, > unsigned int which, u16 opcode, u32 flags); > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > index b286ff6dec29..2e9b6211814a 100644 > --- a/net/ceph/osd_client.c > +++ b/net/ceph/osd_client.c > @@ -1299,8 +1299,10 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, > __pool_full(pi); > > WARN_ON(pi->id != t->base_oloc.pool); > - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || > - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); > + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || > + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || > + (osdc->epoch_barrier && > + osdc->osdmap->epoch < osdc->epoch_barrier); Why the osdc->epoch_barrier != 0 check, here and everywhere else? > } > > enum calc_target_result { > @@ -1610,21 +1612,24 @@ static void send_request(struct ceph_osd_request *req) > static void maybe_request_map(struct ceph_osd_client *osdc) > { > bool continuous = false; > + u32 epoch = osdc->osdmap->epoch; > > verify_osdc_locked(osdc); > - WARN_ON(!osdc->osdmap->epoch); > + WARN_ON_ONCE(epoch == 0); > > if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || > ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || > + (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) { > dout("%s osdc %p continuous\n", __func__, osdc); > continuous = true; > } else { > dout("%s osdc %p onetime\n", __func__, osdc); > } > > + ++epoch; > if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, > - osdc->osdmap->epoch + 1, continuous)) > + epoch, continuous)) > ceph_monc_renew_subs(&osdc->client->monc); > } Why was this change needed? Wouldn't the existing call to unmodified maybe_request_map() from ceph_osdc_handle_map() be sufficient? > > @@ -1653,8 +1658,14 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) > goto promote; > } > > - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > + if (osdc->epoch_barrier && > + osdc->osdmap->epoch < osdc->epoch_barrier) { > + dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, > + osdc->epoch_barrier); > + req->r_t.paused = true; > + maybe_request_map(osdc); > + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > dout("req %p pausewr\n", req); > req->r_t.paused = true; > maybe_request_map(osdc); > @@ -1772,7 +1783,8 @@ static void complete_request(struct ceph_osd_request *req, int err) > > /* > * Drop all pending requests that are stalled waiting on a full condition to > - * clear, and complete them with ENOSPC as the return code. > + * clear, and complete them with ENOSPC as the return code. Set the > + * osdc->epoch_barrier to the latest replay version epoch that was aborted. > */ > static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) > { > @@ -1808,7 +1820,11 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) > mutex_unlock(&osd->lock); > } > out: > - dout("return abort_on_full latest_epoch=%u\n", latest_epoch); > + if (latest_epoch) > + osdc->epoch_barrier = max(latest_epoch + 1, > + osdc->epoch_barrier); > + dout("return abort_on_full latest_epoch=%u barrier=%u\n", latest_epoch, > + osdc->epoch_barrier); > } > > static void cancel_map_check(struct ceph_osd_request *req) > @@ -3266,7 +3282,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) > pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || > ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || > have_pool_full(osdc); > - if (was_pauserd || was_pausewr || pauserd || pausewr) > + if (was_pauserd || was_pausewr || pauserd || pausewr || > + (osdc->epoch_barrier && osdc->osdmap->epoch < osdc->epoch_barrier)) > maybe_request_map(osdc); > > kick_requests(osdc, &need_resend, &need_resend_linger); > @@ -3284,6 +3301,20 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) > up_write(&osdc->lock); > } > > +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) > +{ > + down_read(&osdc->lock); > + if (unlikely(eb > osdc->epoch_barrier)) { > + up_read(&osdc->lock); > + down_write(&osdc->lock); > + osdc->epoch_barrier = max(eb, osdc->epoch_barrier); I'd replace this with an if and add a dout for when the epoch_barrier is actually updated. We should probably do maybe_request_map() in that branch is well. Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2017-03-28 at 16:54 +0200, Ilya Dryomov wrote: > On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote: > > Cephfs can get cap update requests that contain a new epoch barrier in > > them. When that happens we want to pause all OSD traffic until the right > > map epoch arrives. > > > > Add an epoch_barrier field to ceph_osd_client that is protected by the > > osdc->lock rwsem. When the barrier is set, and the current OSD map > > epoch is below that, pause the request target when submitting the > > request or when revisiting it. Add a way for upper layers (cephfs) > > to update the epoch_barrier as well. > > > > If we get a new map, compare the new epoch against the barrier before > > kicking requests and request another map if the map epoch is still lower > > than the one we want. > > > > If we end up cancelling requests because of a new map showing a full OSD > > or pool condition, then set the barrier higher than the highest replay > > epoch of all the cancelled requests. > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > --- > > include/linux/ceph/osd_client.h | 2 ++ > > net/ceph/osd_client.c | 51 +++++++++++++++++++++++++++++++++-------- > > 2 files changed, 43 insertions(+), 10 deletions(-) > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > index 55dcff2455e0..833942226560 100644 > > --- a/include/linux/ceph/osd_client.h > > +++ b/include/linux/ceph/osd_client.h > > @@ -266,6 +266,7 @@ struct ceph_osd_client { > > struct rb_root osds; /* osds */ > > struct list_head osd_lru; /* idle osds */ > > spinlock_t osd_lru_lock; > > + u32 epoch_barrier; > > It would be good to include it in debugfs -- osdmap_show(). > Ok, I'll plan to add it in there. > > struct ceph_osd homeless_osd; > > atomic64_t last_tid; /* tid of last request */ > > u64 last_linger_id; > > @@ -304,6 +305,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, > > struct ceph_msg *msg); > > extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, > > struct ceph_msg *msg); > > +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); > > > > extern void osd_req_op_init(struct ceph_osd_request *osd_req, > > unsigned int which, u16 opcode, u32 flags); > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > > index b286ff6dec29..2e9b6211814a 100644 > > --- a/net/ceph/osd_client.c > > +++ b/net/ceph/osd_client.c > > @@ -1299,8 +1299,10 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, > > __pool_full(pi); > > > > WARN_ON(pi->id != t->base_oloc.pool); > > - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || > > - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); > > + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || > > + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || > > + (osdc->epoch_barrier && > > + osdc->osdmap->epoch < osdc->epoch_barrier); > > Why the osdc->epoch_barrier != 0 check, here and everywhere else? > My understanding is that we have to deal with clusters that predate the addition of this value. The client sets this to 0 when it's not set. > > } > > > > enum calc_target_result { > > @@ -1610,21 +1612,24 @@ static void send_request(struct ceph_osd_request *req) > > static void maybe_request_map(struct ceph_osd_client *osdc) > > { > > bool continuous = false; > > + u32 epoch = osdc->osdmap->epoch; > > > > verify_osdc_locked(osdc); > > - WARN_ON(!osdc->osdmap->epoch); > > + WARN_ON_ONCE(epoch == 0); > > > > if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || > > ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || > > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || > > + (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) { > > dout("%s osdc %p continuous\n", __func__, osdc); > > continuous = true; > > } else { > > dout("%s osdc %p onetime\n", __func__, osdc); > > } > > > > + ++epoch; > > if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, > > - osdc->osdmap->epoch + 1, continuous)) > > + epoch, continuous)) > > ceph_monc_renew_subs(&osdc->client->monc); > > } > > Why was this change needed? Wouldn't the existing call to unmodified > maybe_request_map() from ceph_osdc_handle_map() be sufficient? > It's not strictly required, but it's more efficient to fetch the epoch at the beginning of the function like this and then work with it the value on the stack than to potentially deal with having to refetch it. It also looks cleaner. > > > > @@ -1653,8 +1658,14 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) > > goto promote; > > } > > > > - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && > > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > > + if (osdc->epoch_barrier && > > + osdc->osdmap->epoch < osdc->epoch_barrier) { > > + dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, > > + osdc->epoch_barrier); > > + req->r_t.paused = true; > > + maybe_request_map(osdc); > > + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && > > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > > dout("req %p pausewr\n", req); > > req->r_t.paused = true; > > maybe_request_map(osdc); > > @@ -1772,7 +1783,8 @@ static void complete_request(struct ceph_osd_request *req, int err) > > > > /* > > * Drop all pending requests that are stalled waiting on a full condition to > > - * clear, and complete them with ENOSPC as the return code. > > + * clear, and complete them with ENOSPC as the return code. Set the > > + * osdc->epoch_barrier to the latest replay version epoch that was aborted. > > */ > > static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) > > { > > @@ -1808,7 +1820,11 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) > > mutex_unlock(&osd->lock); > > } > > out: > > - dout("return abort_on_full latest_epoch=%u\n", latest_epoch); > > + if (latest_epoch) > > + osdc->epoch_barrier = max(latest_epoch + 1, > > + osdc->epoch_barrier); > > + dout("return abort_on_full latest_epoch=%u barrier=%u\n", latest_epoch, > > + osdc->epoch_barrier); > > } > > > > static void cancel_map_check(struct ceph_osd_request *req) > > @@ -3266,7 +3282,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) > > pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || > > ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || > > have_pool_full(osdc); > > - if (was_pauserd || was_pausewr || pauserd || pausewr) > > + if (was_pauserd || was_pausewr || pauserd || pausewr || > > + (osdc->epoch_barrier && osdc->osdmap->epoch < osdc->epoch_barrier)) > > maybe_request_map(osdc); > > > > kick_requests(osdc, &need_resend, &need_resend_linger); > > @@ -3284,6 +3301,20 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) > > up_write(&osdc->lock); > > } > > > > +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) > > +{ > > + down_read(&osdc->lock); > > + if (unlikely(eb > osdc->epoch_barrier)) { > > + up_read(&osdc->lock); > > + down_write(&osdc->lock); > > + osdc->epoch_barrier = max(eb, osdc->epoch_barrier); > > I'd replace this with an if and add a dout for when the epoch_barrier > is actually updated. We should probably do maybe_request_map() in that > branch is well. > Ok, thanks. I'll clean that up.
On Wed, Mar 29, 2017 at 1:54 PM, Jeff Layton <jlayton@redhat.com> wrote: > On Tue, 2017-03-28 at 16:54 +0200, Ilya Dryomov wrote: >> On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote: >> > Cephfs can get cap update requests that contain a new epoch barrier in >> > them. When that happens we want to pause all OSD traffic until the right >> > map epoch arrives. >> > >> > Add an epoch_barrier field to ceph_osd_client that is protected by the >> > osdc->lock rwsem. When the barrier is set, and the current OSD map >> > epoch is below that, pause the request target when submitting the >> > request or when revisiting it. Add a way for upper layers (cephfs) >> > to update the epoch_barrier as well. >> > >> > If we get a new map, compare the new epoch against the barrier before >> > kicking requests and request another map if the map epoch is still lower >> > than the one we want. >> > >> > If we end up cancelling requests because of a new map showing a full OSD >> > or pool condition, then set the barrier higher than the highest replay >> > epoch of all the cancelled requests. >> > >> > Signed-off-by: Jeff Layton <jlayton@redhat.com> >> > --- >> > include/linux/ceph/osd_client.h | 2 ++ >> > net/ceph/osd_client.c | 51 +++++++++++++++++++++++++++++++++-------- >> > 2 files changed, 43 insertions(+), 10 deletions(-) >> > >> > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >> > index 55dcff2455e0..833942226560 100644 >> > --- a/include/linux/ceph/osd_client.h >> > +++ b/include/linux/ceph/osd_client.h >> > @@ -266,6 +266,7 @@ struct ceph_osd_client { >> > struct rb_root osds; /* osds */ >> > struct list_head osd_lru; /* idle osds */ >> > spinlock_t osd_lru_lock; >> > + u32 epoch_barrier; >> >> It would be good to include it in debugfs -- osdmap_show(). >> > > Ok, I'll plan to add it in there. > >> > struct ceph_osd homeless_osd; >> > atomic64_t last_tid; /* tid of last request */ >> > u64 last_linger_id; >> > @@ -304,6 +305,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, >> > struct ceph_msg *msg); >> > extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, >> > struct ceph_msg *msg); >> > +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); >> > >> > extern void osd_req_op_init(struct ceph_osd_request *osd_req, >> > unsigned int which, u16 opcode, u32 flags); >> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c >> > index b286ff6dec29..2e9b6211814a 100644 >> > --- a/net/ceph/osd_client.c >> > +++ b/net/ceph/osd_client.c >> > @@ -1299,8 +1299,10 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, >> > __pool_full(pi); >> > >> > WARN_ON(pi->id != t->base_oloc.pool); >> > - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || >> > - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); >> > + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || >> > + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || >> > + (osdc->epoch_barrier && >> > + osdc->osdmap->epoch < osdc->epoch_barrier); >> >> Why the osdc->epoch_barrier != 0 check, here and everywhere else? >> > > My understanding is that we have to deal with clusters that predate the > addition of this value. The client sets this to 0 when it's not set. That and initialization to 0 in ceph_create_client(), so how does this != 0 check affect anything? Won't we always return false in that case, thus preserving the old behavior? > >> > } >> > >> > enum calc_target_result { >> > @@ -1610,21 +1612,24 @@ static void send_request(struct ceph_osd_request *req) >> > static void maybe_request_map(struct ceph_osd_client *osdc) >> > { >> > bool continuous = false; >> > + u32 epoch = osdc->osdmap->epoch; >> > >> > verify_osdc_locked(osdc); >> > - WARN_ON(!osdc->osdmap->epoch); >> > + WARN_ON_ONCE(epoch == 0); >> > >> > if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || >> > ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || >> > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { >> > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || >> > + (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) { >> > dout("%s osdc %p continuous\n", __func__, osdc); >> > continuous = true; >> > } else { >> > dout("%s osdc %p onetime\n", __func__, osdc); >> > } >> > >> > + ++epoch; >> > if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, >> > - osdc->osdmap->epoch + 1, continuous)) >> > + epoch, continuous)) >> > ceph_monc_renew_subs(&osdc->client->monc); >> > } >> >> Why was this change needed? Wouldn't the existing call to unmodified >> maybe_request_map() from ceph_osdc_handle_map() be sufficient? >> > > It's not strictly required, but it's more efficient to fetch the epoch > at the beginning of the function like this and then work with it the > value on the stack than to potentially deal with having to refetch it. > It also looks cleaner. It was more about the if condition change. I don't see a reason for it and it's not present in the Objecter counterpart, so I'd rather we drop the entire hunk. Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2017-03-29 at 18:52 +0200, Ilya Dryomov wrote: > On Wed, Mar 29, 2017 at 1:54 PM, Jeff Layton <jlayton@redhat.com> wrote: > > On Tue, 2017-03-28 at 16:54 +0200, Ilya Dryomov wrote: > > > On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > > Cephfs can get cap update requests that contain a new epoch barrier in > > > > them. When that happens we want to pause all OSD traffic until the right > > > > map epoch arrives. > > > > > > > > Add an epoch_barrier field to ceph_osd_client that is protected by the > > > > osdc->lock rwsem. When the barrier is set, and the current OSD map > > > > epoch is below that, pause the request target when submitting the > > > > request or when revisiting it. Add a way for upper layers (cephfs) > > > > to update the epoch_barrier as well. > > > > > > > > If we get a new map, compare the new epoch against the barrier before > > > > kicking requests and request another map if the map epoch is still lower > > > > than the one we want. > > > > > > > > If we end up cancelling requests because of a new map showing a full OSD > > > > or pool condition, then set the barrier higher than the highest replay > > > > epoch of all the cancelled requests. > > > > > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > > > --- > > > > include/linux/ceph/osd_client.h | 2 ++ > > > > net/ceph/osd_client.c | 51 +++++++++++++++++++++++++++++++++-------- > > > > 2 files changed, 43 insertions(+), 10 deletions(-) > > > > > > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > > > > index 55dcff2455e0..833942226560 100644 > > > > --- a/include/linux/ceph/osd_client.h > > > > +++ b/include/linux/ceph/osd_client.h > > > > @@ -266,6 +266,7 @@ struct ceph_osd_client { > > > > struct rb_root osds; /* osds */ > > > > struct list_head osd_lru; /* idle osds */ > > > > spinlock_t osd_lru_lock; > > > > + u32 epoch_barrier; > > > > > > It would be good to include it in debugfs -- osdmap_show(). > > > > > > > Ok, I'll plan to add it in there. > > > > > > struct ceph_osd homeless_osd; > > > > atomic64_t last_tid; /* tid of last request */ > > > > u64 last_linger_id; > > > > @@ -304,6 +305,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, > > > > struct ceph_msg *msg); > > > > extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, > > > > struct ceph_msg *msg); > > > > +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); > > > > > > > > extern void osd_req_op_init(struct ceph_osd_request *osd_req, > > > > unsigned int which, u16 opcode, u32 flags); > > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > > > > index b286ff6dec29..2e9b6211814a 100644 > > > > --- a/net/ceph/osd_client.c > > > > +++ b/net/ceph/osd_client.c > > > > @@ -1299,8 +1299,10 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, > > > > __pool_full(pi); > > > > > > > > WARN_ON(pi->id != t->base_oloc.pool); > > > > - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || > > > > - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); > > > > + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || > > > > + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || > > > > + (osdc->epoch_barrier && > > > > + osdc->osdmap->epoch < osdc->epoch_barrier); > > > > > > Why the osdc->epoch_barrier != 0 check, here and everywhere else? > > > > > > > My understanding is that we have to deal with clusters that predate the > > addition of this value. The client sets this to 0 when it's not set. > > That and initialization to 0 in ceph_create_client(), so how does this > != 0 check affect anything? Won't we always return false in that case, > thus preserving the old behavior? > > > > > > > } > > > > > > > > enum calc_target_result { > > > > @@ -1610,21 +1612,24 @@ static void send_request(struct ceph_osd_request *req) > > > > static void maybe_request_map(struct ceph_osd_client *osdc) > > > > { > > > > bool continuous = false; > > > > + u32 epoch = osdc->osdmap->epoch; > > > > > > > > verify_osdc_locked(osdc); > > > > - WARN_ON(!osdc->osdmap->epoch); > > > > + WARN_ON_ONCE(epoch == 0); > > > > > > > > if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || > > > > ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || > > > > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > > > > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || > > > > + (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) { > > > > dout("%s osdc %p continuous\n", __func__, osdc); > > > > continuous = true; > > > > } else { > > > > dout("%s osdc %p onetime\n", __func__, osdc); > > > > } > > > > > > > > + ++epoch; > > > > if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, > > > > - osdc->osdmap->epoch + 1, continuous)) > > > > + epoch, continuous)) > > > > ceph_monc_renew_subs(&osdc->client->monc); > > > > } > > > > > > Why was this change needed? Wouldn't the existing call to unmodified > > > maybe_request_map() from ceph_osdc_handle_map() be sufficient? > > > > > > > It's not strictly required, but it's more efficient to fetch the epoch > > at the beginning of the function like this and then work with it the > > value on the stack than to potentially deal with having to refetch it. > > It also looks cleaner. > > It was more about the if condition change. I don't see a reason for it > and it's not present in the Objecter counterpart, so I'd rather we drop > the entire hunk. > To be clear, there is no functional difference here. epoch == osdc->osdmap->epoch, and we don't use "epoch" after that point. I'll leave the code as-is per your wish, but I don't think it makes any substantive difference here.
On Wed, Mar 29, 2017 at 7:43 PM, Jeff Layton <jlayton@redhat.com> wrote: > On Wed, 2017-03-29 at 18:52 +0200, Ilya Dryomov wrote: >> On Wed, Mar 29, 2017 at 1:54 PM, Jeff Layton <jlayton@redhat.com> wrote: >> > On Tue, 2017-03-28 at 16:54 +0200, Ilya Dryomov wrote: >> > > On Sat, Feb 25, 2017 at 6:43 PM, Jeff Layton <jlayton@redhat.com> wrote: >> > > > Cephfs can get cap update requests that contain a new epoch barrier in >> > > > them. When that happens we want to pause all OSD traffic until the right >> > > > map epoch arrives. >> > > > >> > > > Add an epoch_barrier field to ceph_osd_client that is protected by the >> > > > osdc->lock rwsem. When the barrier is set, and the current OSD map >> > > > epoch is below that, pause the request target when submitting the >> > > > request or when revisiting it. Add a way for upper layers (cephfs) >> > > > to update the epoch_barrier as well. >> > > > >> > > > If we get a new map, compare the new epoch against the barrier before >> > > > kicking requests and request another map if the map epoch is still lower >> > > > than the one we want. >> > > > >> > > > If we end up cancelling requests because of a new map showing a full OSD >> > > > or pool condition, then set the barrier higher than the highest replay >> > > > epoch of all the cancelled requests. >> > > > >> > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> >> > > > --- >> > > > include/linux/ceph/osd_client.h | 2 ++ >> > > > net/ceph/osd_client.c | 51 +++++++++++++++++++++++++++++++++-------- >> > > > 2 files changed, 43 insertions(+), 10 deletions(-) >> > > > >> > > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >> > > > index 55dcff2455e0..833942226560 100644 >> > > > --- a/include/linux/ceph/osd_client.h >> > > > +++ b/include/linux/ceph/osd_client.h >> > > > @@ -266,6 +266,7 @@ struct ceph_osd_client { >> > > > struct rb_root osds; /* osds */ >> > > > struct list_head osd_lru; /* idle osds */ >> > > > spinlock_t osd_lru_lock; >> > > > + u32 epoch_barrier; >> > > >> > > It would be good to include it in debugfs -- osdmap_show(). >> > > >> > >> > Ok, I'll plan to add it in there. >> > >> > > > struct ceph_osd homeless_osd; >> > > > atomic64_t last_tid; /* tid of last request */ >> > > > u64 last_linger_id; >> > > > @@ -304,6 +305,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, >> > > > struct ceph_msg *msg); >> > > > extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, >> > > > struct ceph_msg *msg); >> > > > +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); >> > > > >> > > > extern void osd_req_op_init(struct ceph_osd_request *osd_req, >> > > > unsigned int which, u16 opcode, u32 flags); >> > > > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c >> > > > index b286ff6dec29..2e9b6211814a 100644 >> > > > --- a/net/ceph/osd_client.c >> > > > +++ b/net/ceph/osd_client.c >> > > > @@ -1299,8 +1299,10 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, >> > > > __pool_full(pi); >> > > > >> > > > WARN_ON(pi->id != t->base_oloc.pool); >> > > > - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || >> > > > - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); >> > > > + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || >> > > > + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || >> > > > + (osdc->epoch_barrier && >> > > > + osdc->osdmap->epoch < osdc->epoch_barrier); >> > > >> > > Why the osdc->epoch_barrier != 0 check, here and everywhere else? >> > > >> > >> > My understanding is that we have to deal with clusters that predate the >> > addition of this value. The client sets this to 0 when it's not set. >> >> That and initialization to 0 in ceph_create_client(), so how does this >> != 0 check affect anything? Won't we always return false in that case, >> thus preserving the old behavior? >> >> > >> > > > } >> > > > >> > > > enum calc_target_result { >> > > > @@ -1610,21 +1612,24 @@ static void send_request(struct ceph_osd_request *req) >> > > > static void maybe_request_map(struct ceph_osd_client *osdc) >> > > > { >> > > > bool continuous = false; >> > > > + u32 epoch = osdc->osdmap->epoch; >> > > > >> > > > verify_osdc_locked(osdc); >> > > > - WARN_ON(!osdc->osdmap->epoch); >> > > > + WARN_ON_ONCE(epoch == 0); >> > > > >> > > > if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || >> > > > ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || >> > > > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { >> > > > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || >> > > > + (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) { >> > > > dout("%s osdc %p continuous\n", __func__, osdc); >> > > > continuous = true; >> > > > } else { >> > > > dout("%s osdc %p onetime\n", __func__, osdc); >> > > > } >> > > > >> > > > + ++epoch; >> > > > if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, >> > > > - osdc->osdmap->epoch + 1, continuous)) >> > > > + epoch, continuous)) >> > > > ceph_monc_renew_subs(&osdc->client->monc); >> > > > } >> > > >> > > Why was this change needed? Wouldn't the existing call to unmodified >> > > maybe_request_map() from ceph_osdc_handle_map() be sufficient? >> > > >> > >> > It's not strictly required, but it's more efficient to fetch the epoch >> > at the beginning of the function like this and then work with it the >> > value on the stack than to potentially deal with having to refetch it. >> > It also looks cleaner. >> >> It was more about the if condition change. I don't see a reason for it >> and it's not present in the Objecter counterpart, so I'd rather we drop >> the entire hunk. >> > > To be clear, there is no functional difference here. > > epoch == osdc->osdmap->epoch, and we don't use "epoch" after that > point. I'll leave the code as-is per your wish, but I don't think it > makes any substantive difference here. I was talking about this bit: > - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { > + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || > + (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) { It is a functional change, although definitely not substantive. Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 55dcff2455e0..833942226560 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -266,6 +266,7 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; + u32 epoch_barrier; struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; @@ -304,6 +305,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); extern void osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b286ff6dec29..2e9b6211814a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1299,8 +1299,10 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || + (osdc->epoch_barrier && + osdc->osdmap->epoch < osdc->epoch_barrier); } enum calc_target_result { @@ -1610,21 +1612,24 @@ static void send_request(struct ceph_osd_request *req) static void maybe_request_map(struct ceph_osd_client *osdc) { bool continuous = false; + u32 epoch = osdc->osdmap->epoch; verify_osdc_locked(osdc); - WARN_ON(!osdc->osdmap->epoch); + WARN_ON_ONCE(epoch == 0); if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + (osdc->epoch_barrier && epoch < osdc->epoch_barrier)) { dout("%s osdc %p continuous\n", __func__, osdc); continuous = true; } else { dout("%s osdc %p onetime\n", __func__, osdc); } + ++epoch; if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, - osdc->osdmap->epoch + 1, continuous)) + epoch, continuous)) ceph_monc_renew_subs(&osdc->client->monc); } @@ -1653,8 +1658,14 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) goto promote; } - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + if (osdc->epoch_barrier && + osdc->osdmap->epoch < osdc->epoch_barrier) { + dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, + osdc->epoch_barrier); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); @@ -1772,7 +1783,8 @@ static void complete_request(struct ceph_osd_request *req, int err) /* * Drop all pending requests that are stalled waiting on a full condition to - * clear, and complete them with ENOSPC as the return code. + * clear, and complete them with ENOSPC as the return code. Set the + * osdc->epoch_barrier to the latest replay version epoch that was aborted. */ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { @@ -1808,7 +1820,11 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) mutex_unlock(&osd->lock); } out: - dout("return abort_on_full latest_epoch=%u\n", latest_epoch); + if (latest_epoch) + osdc->epoch_barrier = max(latest_epoch + 1, + osdc->epoch_barrier); + dout("return abort_on_full latest_epoch=%u barrier=%u\n", latest_epoch, + osdc->epoch_barrier); } static void cancel_map_check(struct ceph_osd_request *req) @@ -3266,7 +3282,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); - if (was_pauserd || was_pausewr || pauserd || pausewr) + if (was_pauserd || was_pausewr || pauserd || pausewr || + (osdc->epoch_barrier && osdc->osdmap->epoch < osdc->epoch_barrier)) maybe_request_map(osdc); kick_requests(osdc, &need_resend, &need_resend_linger); @@ -3284,6 +3301,20 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) up_write(&osdc->lock); } +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + down_read(&osdc->lock); + if (unlikely(eb > osdc->epoch_barrier)) { + up_read(&osdc->lock); + down_write(&osdc->lock); + osdc->epoch_barrier = max(eb, osdc->epoch_barrier); + up_write(&osdc->lock); + } else { + up_read(&osdc->lock); + } +} +EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); + /* * Resubmit requests pending on the given osd. */
Cephfs can get cap update requests that contain a new epoch barrier in them. When that happens we want to pause all OSD traffic until the right map epoch arrives. Add an epoch_barrier field to ceph_osd_client that is protected by the osdc->lock rwsem. When the barrier is set, and the current OSD map epoch is below that, pause the request target when submitting the request or when revisiting it. Add a way for upper layers (cephfs) to update the epoch_barrier as well. If we get a new map, compare the new epoch against the barrier before kicking requests and request another map if the map epoch is still lower than the one we want. If we end up cancelling requests because of a new map showing a full OSD or pool condition, then set the barrier higher than the highest replay epoch of all the cancelled requests. Signed-off-by: Jeff Layton <jlayton@redhat.com> --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/osd_client.c | 51 +++++++++++++++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 10 deletions(-)