@@ -1906,7 +1906,13 @@ struct cl_io {
/**
* Set if IO is triggered by async workqueue readahead.
*/
- ci_async_readahead:1;
+ ci_async_readahead:1,
+ /**
+ * Set if we have tried all mirrors for this read IO; if it is not
+ * set, the read IO will check the to-be-read OSCs' status and
+ * fast-switch to another mirror if some of the OSTs are unhealthy.
+ */
+ ci_tried_all_mirrors:1;
/**
* How many times the read has retried before this one.
* Set by the top level and consumed by the LOV.
@@ -140,6 +140,7 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
sub_io->ci_lock_no_expand = io->ci_lock_no_expand;
sub_io->ci_ndelay = io->ci_ndelay;
sub_io->ci_layout_version = io->ci_layout_version;
+ sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors;
rc = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj);
if (rc < 0)
@@ -395,13 +396,13 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
found = true;
break;
}
- }
-
+ } /* each component of the mirror */
if (found) {
index = (index + i) % comp->lo_mirror_count;
break;
}
- }
+ } /* each mirror */
+
if (i == comp->lo_mirror_count) {
CERROR(DFID ": failed to find a component covering I/O region at %llu\n",
PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos);
@@ -423,16 +424,21 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
* of this client has been partitioned. We should relinquish CPU for
* a while before trying again.
*/
- ++io->ci_ndelay_tried;
- if (io->ci_ndelay && io->ci_ndelay_tried >= comp->lo_mirror_count) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 10ms */
+ if (io->ci_ndelay && io->ci_ndelay_tried > 0 &&
+ (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) {
+ schedule_timeout_interruptible(HZ / 100 + 1); /* 10ms */
if (signal_pending(current))
return -EINTR;
- /* reset retry counter */
- io->ci_ndelay_tried = 1;
+ /*
+ * Set ci_tried_all_mirrors to turn off fast mirror switching
+ * for reads after we have tried all mirrors for several
+ * rounds.
+ */
+ io->ci_tried_all_mirrors = io->ci_ndelay_tried %
+ (comp->lo_mirror_count * 4) == 0;
}
+ ++io->ci_ndelay_tried;
CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n",
io->ci_ndelay ? "non-" : "");
@@ -668,6 +674,7 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio,
case CIT_READ:
case CIT_WRITE: {
io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent);
+ io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors;
if (cl_io_is_append(parent)) {
io->u.ci_wr.wr_append = 1;
} else {
@@ -368,6 +368,13 @@ int osc_io_commit_async(const struct lu_env *env,
}
EXPORT_SYMBOL(osc_io_commit_async);
+/* An import is unhealthy when invalidated, deactivated, or not connected. */
+static bool osc_import_not_healthy(struct obd_import *imp)
+{
+	return imp->imp_invalid || imp->imp_deactive ||
+	       (imp->imp_state != LUSTRE_IMP_FULL &&
+		imp->imp_state != LUSTRE_IMP_IDLE);
+}
+
int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios)
{
struct osc_object *osc = cl2osc(ios->cis_obj);
@@ -376,7 +383,14 @@ int osc_io_iter_init(const struct lu_env *env, const struct cl_io_slice *ios)
int rc = -EIO;
spin_lock(&imp->imp_lock);
- if (likely(!imp->imp_invalid)) {
+ /*
+ * Check whether this OSC device is available for non-delay read;
+ * fast-switch to another mirror if we have not tried all mirrors.
+ */
+ if (ios->cis_io->ci_type == CIT_READ && ios->cis_io->ci_ndelay &&
+ !ios->cis_io->ci_tried_all_mirrors && osc_import_not_healthy(imp)) {
+ rc = -EWOULDBLOCK;
+ } else if (likely(!imp->imp_invalid)) {
atomic_inc(&osc->oo_nr_ios);
oio->oi_is_active = 1;
rc = 0;