@@ -583,6 +583,16 @@ OPTION(osd_client_op_priority, OPT_U32, 63)
OPTION(osd_recovery_op_priority, OPT_U32, 10)
OPTION(osd_recovery_op_warn_multiple, OPT_U32, 16)
+// Removal of PGs is done in the background, but if the OSD is restarted,
+// it will finish all pending removals before joining the cluster.
+// This can take a while. If this option is set to true, pending
+// removals are instead performed in the background while the OSD runs
+// normally. This is a bit dangerous if the OSD gets a new copy of
+// the PG before the pending removal has completed: attributes stored
+// in leveldb may be lost if the removal cleans up an object's
+// attributes AFTER the new object has been backfilled.
+OPTION(osd_startup_finish_remove_in_background, OPT_BOOL, false)
+
// Max time to wait between notifying mon of shutdown and shutting down
OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE, 5)
@@ -2048,6 +2048,8 @@ void OSD::load_pgs()
set<spg_t> head_pgs;
map<spg_t, interval_set<snapid_t> > pgs;
+ typedef map<uint64_t, pair<spg_t, coll_t> > bgremove_t;
+ bgremove_t *bgremove = NULL;
bool flush = false;
for (vector<coll_t>::iterator it = ls.begin();
it != ls.end();
@@ -2056,14 +2058,30 @@ void OSD::load_pgs()
snapid_t snap;
uint64_t seq;
- if (it->is_temp(pgid) ||
- it->is_removal(&seq, &pgid)) {
- dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
+ if (it->is_temp(pgid)) {
+ dout(10) << "load_pgs " << *it << " clearing temp " << dendl;
recursive_remove_collection(store, *it, false);
flush = true;
continue;
}
+ if (it->is_removal(&seq, &pgid)) {
+ if (cct->_conf->osd_startup_finish_remove_in_background) {
+ dout(10) << "load_pgs " << *it
+ << " delaying pending removal" << dendl;
+ if (seq >= next_removal_seq)
+ next_removal_seq = seq + 1;
+ if (!bgremove)
+ bgremove = new bgremove_t();
+ (*bgremove)[seq] = make_pair(pgid, *it);
+ } else {
+ dout(10) << "load_pgs " << *it << " clearing pending removal " << dendl;
+ recursive_remove_collection(store, *it, false);
+ flush = true;
+ }
+ continue;
+ }
+
if (it->is_pg(pgid, snap)) {
if (snap != CEPH_NOSNAP) {
dout(10) << "load_pgs skipping snapped dir " << *it
@@ -2081,6 +2099,18 @@ void OSD::load_pgs()
if (flush)
store->sync_and_flush();
+ if (bgremove) {
+ for (bgremove_t::iterator it = bgremove->begin();
+ it != bgremove->end(); it++) {
+ dout(10) << "load_pgs FORREMOVAL_" << it->first << "_" << it->second
+ << " scheduling background removal " << dendl;
+ DeletingStateRef deleting = service.deleting_pgs.lookup_or_create
+ (it->second.first, make_pair(it->second.first, it->second.second));
+ remove_wq.queue(make_pair(PGRef(0), deleting));
+ }
+ delete bgremove;
+ }
+
bool has_upgraded = false;
for (map<spg_t, interval_set<snapid_t> >::iterator i = pgs.begin();
i != pgs.end();
@@ -3519,6 +3549,22 @@ void OSD::RemoveWQ::_process(
pair<PGRef, DeletingStateRef> item,
ThreadPool::TPHandle &handle)
{
+ if (!item.second->resurrectable_p()) {
+ // this is for background live removal of pending FORREMOVAL pgs,
+ // remaining from earlier OSD sessions. This only happens if
+ // osd_startup_finish_remove_in_background is enabled.
+ if (!item.second->start_clearing())
+ return;
+
+ if (!item.second->start_deleting())
+ return;
+
+ recursive_remove_collection(store, item.second->get_coll (), false);
+
+ item.second->finish_deleting();
+ return;
+ }
+
PGRef pg(item.first);
SnapMapper &mapper = pg->snap_mapper;
OSDriver &driver = pg->osdriver;
@@ -217,9 +217,21 @@ class DeletingState {
public:
const spg_t pgid;
const PGRef old_pg_state;
+ const coll_t old_coll; // iff old_pg_state is NULL
DeletingState(const pair<spg_t, PGRef> &in) :
lock("DeletingState::lock"), status(QUEUED), stop_deleting(false),
- pgid(in.first), old_pg_state(in.second) {}
+ pgid(in.first), old_pg_state(in.second), old_coll() {} // built from a PG reference; old_coll is left default-constructed
+ DeletingState(const pair<spg_t, coll_t> &in) : // entry with no PG state: records only the pgid and a collection
+ lock("DeletingState::lock"), status(QUEUED), stop_deleting(false),
+ pgid(in.first), old_pg_state(NULL), old_coll(in.second) {}
+
+  bool resurrectable_p() const {  // true iff old_pg_state is non-null
+    return old_pg_state ? true : false;
+  }
+ coll_t get_coll() const { // returns a copy of old_coll; valid only for entries without PG state
+ assert(!resurrectable_p()); // old_coll is only populated when old_pg_state is NULL
+ return old_coll;
+ }
/// transition status to clearing
bool start_clearing() {
@@ -286,7 +298,8 @@ public:
/// try to halt the deletion
bool try_stop_deletion() {
Mutex::Locker l(lock);
- stop_deleting = true;
+ if (resurrectable_p())
+ stop_deleting = true;
/**
* If we are in DELETING_DIR or CLEARING_DIR, there are in progress
* operations we have to wait for before continuing on. States