@@ -1775,6 +1775,8 @@ struct cl_io {
struct cl_lockset ci_lockset;
/** lock requirements, this is just a help info for sublayers. */
enum cl_io_lock_dmd ci_lockreq;
+ /** layout version when this IO occurs */
+ u32 ci_layout_version;
union {
struct cl_rd_io {
struct cl_io_rw_common rd;
@@ -1850,8 +1852,10 @@ struct cl_io {
*/
ci_ignore_layout:1,
/**
- * Need MDS intervention to complete a write. This usually means the
- * corresponding component is not initialized for the writing extent.
+ * Need MDS intervention to complete a write.
+ * Write intent is required for the following cases:
+ * 1. component being written is not initialized, or
+ * 2. the mirrored files are NOT in WRITE_PENDING state.
*/
ci_need_write_intent:1,
/**
@@ -586,8 +586,9 @@ int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj,
struct osc_page *ops);
int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
struct osc_page *ops);
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
- struct list_head *list, int brw_flags);
+int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io,
+ struct osc_object *obj, struct list_head *list,
+ int brw_flags);
int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
u64 size, struct osc_extent **extp);
void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext);
@@ -968,6 +969,8 @@ struct osc_extent {
int oe_rc;
/* max pages per rpc when this extent was created */
unsigned int oe_mppr;
+ /* FLR: layout version when this osc_extent is published */
+ u32 oe_layout_version;
};
/* @} osc */
@@ -687,9 +687,10 @@ static inline int it_to_lock_mode(struct lookup_intent *it)
/* CREAT needs to be tested before open (both could be set) */
if (it->it_op & IT_CREAT)
return LCK_CW;
- else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP |
- IT_LAYOUT))
+ else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP))
return LCK_CR;
+ else if (it->it_op & IT_LAYOUT)
+ return (it->it_flags & FMODE_WRITE) ? LCK_EX : LCK_CR;
else if (it->it_op & IT_READDIR)
return LCK_PR;
else if (it->it_op & IT_GETXATTR)
@@ -476,6 +476,8 @@
/* FLR */
#define OBD_FAIL_FLR_GLIMPSE_IMMUTABLE 0x1A00
+#define OBD_FAIL_FLR_LV_DELAY 0x1A01
+#define OBD_FAIL_FLR_LV_INC 0x1A02
/* Assign references to moved code to reduce code changes */
#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id)
@@ -4206,8 +4206,8 @@ int ll_layout_write_intent(struct inode *inode, u64 start, u64 end)
{
struct layout_intent intent = {
.li_opc = LAYOUT_INTENT_WRITE,
- .li_start = start,
- .li_end = end,
+ .li_extent.e_start = start,
+ .li_extent.e_end = end,
};
int rc;
@@ -165,6 +165,13 @@ static int vvp_prune(const struct lu_env *env, struct cl_object *obj)
}
truncate_inode_pages(inode->i_mapping, 0);
+ if (inode->i_mapping->nrpages) {
+ CDEBUG(D_VFSTRACE, DFID ": still has %lu pages remaining\n",
+ PFID(lu_object_fid(&obj->co_lu)),
+ inode->i_mapping->nrpages);
+ return -EIO;
+ }
+
return 0;
}
@@ -230,6 +230,7 @@ struct lov_layout_entry {
struct lov_mirror_entry {
unsigned short lre_mirror_id;
unsigned short lre_preferred:1,
+ lre_stale:1, /* set if any component is stale */
lre_valid:1; /* set if at least one of components
* in this mirror is valid
*/
@@ -438,6 +439,8 @@ struct lov_page {
struct cl_page_slice lps_cl;
/** layout_entry + stripe index, composed using lov_comp_index() */
unsigned int lps_index;
+ /* the layout gen when this page was created */
+ u32 lps_layout_gen;
};
/*
@@ -136,6 +136,7 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
sub_io->ci_noatime = io->ci_noatime;
sub_io->ci_lock_no_expand = io->ci_lock_no_expand;
sub_io->ci_ndelay = io->ci_ndelay;
+ sub_io->ci_layout_version = io->ci_layout_version;
rc = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj);
if (rc < 0)
@@ -208,12 +209,88 @@ static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
return 0;
}
+/**
+ * Decide if @io needs a write intent RPC; returns 0, or -EIO if no component covers it
+ */
+static int lov_io_mirror_write_intent(struct lov_io *lio,
+ struct lov_object *obj, struct cl_io *io)
+{
+ struct lov_layout_composite *comp = &obj->u.composite;
+ struct lu_extent *ext = &io->ci_write_intent;
+ struct lov_mirror_entry *lre;
+ struct lov_mirror_entry *primary;
+ struct lov_layout_entry *lle;
+ size_t count = 0;
+
+ *ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos };
+ io->ci_need_write_intent = 0;
+
+ if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) ||
+ cl_io_is_mkwrite(io)))
+ return 0; /* only write, truncate and mkwrite are checked further */
+
+ if (lov_flr_state(obj) == LCM_FL_RDONLY ||
+ lov_flr_state(obj) == LCM_FL_SYNC_PENDING) {
+ io->ci_need_write_intent = 1;
+ return 0;
+ }
+
+ LASSERT(lov_flr_state(obj) == LCM_FL_WRITE_PENDING);
+ LASSERT(comp->lo_preferred_mirror >= 0);
+
+ /* walk the primary mirror and widen the extent to cover every
+ * component that overlaps the range being written
+ */
+ primary = &comp->lo_mirrors[comp->lo_preferred_mirror];
+ LASSERT(!primary->lre_stale);
+ lov_foreach_mirror_layout_entry(obj, lle, primary) {
+ LASSERT(lle->lle_valid);
+ if (!lu_extent_is_overlapped(ext, lle->lle_extent))
+ continue;
+
+ ext->e_start = min(ext->e_start, lle->lle_extent->e_start);
+ ext->e_end = max(ext->e_end, lle->lle_extent->e_end);
+ ++count;
+ }
+ if (count == 0) {
+ CERROR(DFID ": cannot find any valid components covering file extent " DEXT ", mirror: %d\n",
+ PFID(lu_object_fid(lov2lu(obj))), PEXT(ext),
+ primary->lre_mirror_id);
+ return -EIO;
+ }
+
+ count = 0; /* now counts other mirrors with a valid overlapping component */
+ lov_foreach_mirror_entry(obj, lre) {
+ if (lre == primary)
+ continue;
+
+ lov_foreach_mirror_layout_entry(obj, lle, lre) {
+ if (!lle->lle_valid)
+ continue;
+
+ if (lu_extent_is_overlapped(ext, lle->lle_extent)) {
+ ++count;
+ break;
+ }
+ }
+ }
+
+ CDEBUG(D_VFSTRACE,
+ DFID "there are %zu components to be staled to modify file extent " DEXT ", iot: %d\n",
+ PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type);
+
+ io->ci_need_write_intent = count > 0;
+
+ return 0;
+}
+
static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
struct cl_io *io)
{
struct lov_layout_composite *comp = &obj->u.composite;
int index;
int i;
+ int result;
if (!lov_is_flr(obj)) {
LASSERT(comp->lo_preferred_mirror == 0);
@@ -222,6 +299,22 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
return 0;
}
+ result = lov_io_mirror_write_intent(lio, obj, io);
+ if (result)
+ return result;
+
+ if (io->ci_need_write_intent) {
+ CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n",
+ PFID(lu_object_fid(lov2lu(obj))),
+ lio->lis_pos, lio->lis_endpos);
+
+ /* stop cl_io_init() loop */
+ return 1;
+ }
+
+ /* transfer the layout version for verification */
+ io->ci_layout_version = obj->lo_lsm->lsm_layout_gen;
+
if (io->ci_ndelay_tried == 0 || /* first time to try */
/* reset the mirror index if layout has changed */
lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) {
@@ -325,8 +418,10 @@ static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj,
* the current file-tail exactly.
*/
if (unlikely(obj->lo_lsm->lsm_entries[0]->lsme_pattern &
- LOV_PATTERN_F_HOLE))
- return -EIO;
+ LOV_PATTERN_F_HOLE)) {
+ result = -EIO;
+ goto out;
+ }
lio->lis_pos = 0;
lio->lis_endpos = OBD_OBJECT_EOF;
@@ -371,8 +466,11 @@ static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj,
lio->lis_endpos = OBD_OBJECT_EOF;
if (lov_flr_state(obj) == LCM_FL_RDONLY &&
- !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE))
- return 1; /* SoM is accurate, no need glimpse */
+ !OBD_FAIL_CHECK(OBD_FAIL_FLR_GLIMPSE_IMMUTABLE)) {
+ /* SoM is accurate, no need glimpse */
+ result = 1;
+ goto out;
+ }
break;
case CIT_MISC:
@@ -385,12 +483,14 @@ static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj,
}
result = lov_io_mirror_init(lio, obj, io);
if (result)
- return result;
+ goto out;
/* check if it needs to instantiate layout */
if (!(io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io) ||
- (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0)))
- return 0;
+ (cl_io_is_trunc(io) && io->u.ci_setattr.sa_attr.lvb_size > 0))) {
+ result = 0;
+ goto out;
+ }
ext.e_start = lio->lis_pos;
ext.e_end = lio->lis_endpos;
@@ -409,10 +509,11 @@ static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj,
io->ci_need_write_intent = 1;
io->ci_write_intent = ext;
result = 1;
- break;
+ goto out;
}
}
+out:
return result;
}
@@ -799,6 +900,10 @@ static int lov_io_read_ahead(const struct lu_env *env,
if (index < 0 || !lsm_entry_inited(loo->lo_lsm, index))
return -ENODATA;
+ /* prevent readahead from expanding into stale components */
+ if (!lov_entry(loo, index)->lle_valid)
+ return -EIO;
+
stripe = lov_stripe_number(loo->lo_lsm, index, offset);
r0 = lov_r0(loo, index);
@@ -675,6 +675,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
if (i > 0) {
if (mirror_id == lre->lre_mirror_id) {
lre->lre_valid |= lle->lle_valid;
+ lre->lre_stale |= !lle->lle_valid;
lre->lre_end = i;
continue;
}
@@ -696,6 +697,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
lre->lre_preferred = (lle->lle_lsme->lsme_flags &
LCME_FL_PREFERRED);
lre->lre_valid = lle->lle_valid;
+ lre->lre_stale = !lle->lle_valid;
}
/* sanity check for FLR */
@@ -737,7 +739,7 @@ static int lov_init_composite(const struct lu_env *env, struct lov_device *dev,
i = 0;
lov_foreach_mirror_entry(lov, lre) {
i++;
- if (!lre->lre_valid)
+ if (lre->lre_stale)
continue;
mirror_count++; /* valid mirror */
@@ -57,8 +57,8 @@ static int lov_comp_page_print(const struct lu_env *env,
struct lov_page *lp = cl2lov_page(slice);
return (*printer)(env, cookie,
- LUSTRE_LOV_NAME "-page@%p, comp index: %x\n",
- lp, lp->lps_index);
+ LUSTRE_LOV_NAME "-page@%p, comp index: %x, gen: %u\n",
+ lp, lp->lps_index, lp->lps_layout_gen);
}
static const struct cl_page_operations lov_comp_page_ops = {
@@ -96,6 +96,7 @@ int lov_page_init_composite(const struct lu_env *env, struct cl_object *obj,
LASSERT(rc == 0);
lpg->lps_index = lov_comp_index(entry, stripe);
+ lpg->lps_layout_gen = loo->lo_lsm->lsm_layout_gen;
cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_comp_page_ops);
sub = lov_sub_get(env, lio, lpg->lps_index);
@@ -2479,6 +2479,9 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
++ext->oe_nr_pages;
list_add_tail(&oap->oap_pending_item, &ext->oe_pages);
osc_object_unlock(osc);
+
+ if (!ext->oe_layout_version)
+ ext->oe_layout_version = io->ci_layout_version;
}
return rc;
@@ -2604,8 +2607,9 @@ int osc_flush_async_page(const struct lu_env *env, struct cl_io *io,
return rc;
}
-int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
- struct list_head *list, int brw_flags)
+int osc_queue_sync_pages(const struct lu_env *env, const struct cl_io *io,
+ struct osc_object *obj, struct list_head *list,
+ int brw_flags)
{
struct client_obd *cli = osc_cli(obj);
struct osc_extent *ext;
@@ -2656,6 +2660,7 @@ int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj,
ext->oe_nr_pages = page_count;
ext->oe_mppr = mppr;
list_splice_init(list, &ext->oe_pages);
+ ext->oe_layout_version = io->ci_layout_version;
osc_object_lock(obj);
/* Reuse the initial refcount for RPC, don't drop it */
@@ -188,7 +188,7 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
if (++queued == max_pages) {
queued = 0;
- result = osc_queue_sync_pages(env, osc, &list,
+ result = osc_queue_sync_pages(env, io, osc, &list,
brw_flags);
if (result < 0)
break;
@@ -196,7 +196,7 @@ int osc_io_submit(const struct lu_env *env, const struct cl_io_slice *ios,
}
if (queued > 0)
- result = osc_queue_sync_pages(env, osc, &list, brw_flags);
+ result = osc_queue_sync_pages(env, io, osc, &list, brw_flags);
/* Update c/mtime for sync write. LU-7310 */
if (crt == CRT_WRITE && qout->pl_nr > 0 && !result) {
@@ -558,6 +558,12 @@ static int osc_io_setattr_start(const struct lu_env *env,
oa->o_flags = OBD_FL_SRVLOCK;
oa->o_valid |= OBD_MD_FLFLAGS;
}
+
+ if (io->ci_layout_version > 0) {
+ /* verify layout version */
+ oa->o_valid |= OBD_MD_LAYOUT_VERSION;
+ oa->o_layout_version = io->ci_layout_version;
+ }
} else {
LASSERT(oio->oi_lockless == 0);
}
@@ -1944,6 +1944,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
bool soft_sync = false;
int grant = 0;
bool ndelay = false;
+ u32 layout_version = 0;
int i;
int rc;
struct ost_body *body;
@@ -1957,6 +1958,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
mem_tight |= ext->oe_memalloc;
grant += ext->oe_grants;
page_count += ext->oe_nr_pages;
+ layout_version = max(layout_version, ext->oe_layout_version);
if (!obj)
obj = ext->oe_obj;
}
@@ -2016,8 +2018,16 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
crattr->cra_oa = oa;
cl_req_attr_set(env, osc2cl(obj), crattr);
- if (cmd == OBD_BRW_WRITE)
+ if (cmd == OBD_BRW_WRITE) {
oa->o_grant_used = grant;
+ if (layout_version > 0) {
+ CDEBUG(D_LAYOUT, DFID": write with layout version %u\n",
+ PFID(&oa->o_oi.oi_fid), layout_version);
+
+ oa->o_layout_version = layout_version;
+ oa->o_valid |= OBD_MD_LAYOUT_VERSION;
+ }
+ }
sort_brw_pages(pga, page_count);
rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 1, 0);
@@ -1619,7 +1619,7 @@ static void lustre_swab_obdo(struct obdo *o)
__swab32s(&o->o_stripe_idx);
__swab32s(&o->o_parent_ver);
lustre_swab_ost_layout(&o->o_layout);
- BUILD_BUG_ON(offsetof(typeof(*o), o_padding_3) == 0);
+ __swab32s(&o->o_layout_version);
__swab32s(&o->o_uid_h);
__swab32s(&o->o_gid_h);
__swab64s(&o->o_data_version);
@@ -2374,12 +2374,17 @@ void lustre_swab_hsm_user_item(struct hsm_user_item *hui)
lustre_swab_hsm_extent(&hui->hui_extent);
}
+void lustre_swab_lu_extent(struct lu_extent *le)
+{
+ __swab64s(&le->e_start); /* swab both 64-bit bounds of the extent */
+ __swab64s(&le->e_end);
+}
+
void lustre_swab_layout_intent(struct layout_intent *li)
{
__swab32s(&li->li_opc);
__swab32s(&li->li_flags);
- __swab64s(&li->li_start);
- __swab64s(&li->li_end);
+ lustre_swab_lu_extent(&li->li_extent);
}
void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk)
@@ -1247,10 +1247,10 @@ void lustre_assert_wire_constants(void)
(long long)(int)offsetof(struct obdo, o_layout));
LASSERTF((int)sizeof(((struct obdo *)0)->o_layout) == 28, "found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_layout));
- LASSERTF((int)offsetof(struct obdo, o_padding_3) == 164, "found %lld\n",
- (long long)(int)offsetof(struct obdo, o_padding_3));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_3) == 4, "found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_padding_3));
+ LASSERTF((int)offsetof(struct obdo, o_layout_version) == 164, "found %lld\n",
+ (long long)(int)offsetof(struct obdo, o_layout_version));
+ LASSERTF((int)sizeof(((struct obdo *)0)->o_layout_version) == 4, "found %lld\n",
+ (long long)(int)sizeof(((struct obdo *)0)->o_layout_version));
LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n",
(long long)(int)offsetof(struct obdo, o_uid_h));
LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n",
@@ -4049,14 +4049,10 @@ void lustre_assert_wire_constants(void)
(long long)(int)offsetof(struct layout_intent, li_flags));
LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n",
(long long)(int)sizeof(((struct layout_intent *)0)->li_flags));
- LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n",
- (long long)(int)offsetof(struct layout_intent, li_start));
- LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n",
- (long long)(int)sizeof(((struct layout_intent *)0)->li_start));
- LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n",
- (long long)(int)offsetof(struct layout_intent, li_end));
- LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n",
- (long long)(int)sizeof(((struct layout_intent *)0)->li_end));
+ LASSERTF((int)offsetof(struct layout_intent, li_extent) == 8, "found %lld\n",
+ (long long)(int)offsetof(struct layout_intent, li_extent));
+ LASSERTF((int)sizeof(((struct layout_intent *)0)->li_extent) == 16, "found %lld\n",
+ (long long)(int)sizeof(((struct layout_intent *)0)->li_extent));
LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n",
(long long)LAYOUT_INTENT_ACCESS);
LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n",
@@ -1092,7 +1092,9 @@ static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic)
#define OBD_MD_DOM_SIZE (0x00001000ULL) /* Data-on-MDT component size */
#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */
#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */
-/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. used until 1.6.5 */
+#define OBD_MD_LAYOUT_VERSION (0x00008000ULL) /* layout version for
+ * OST objects
+ */
#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */
#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */
#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */
@@ -2675,7 +2677,7 @@ struct obdo {
* sizeof(ost_layout) + sizeof(__u32) == sizeof(llog_cookie).
*/
struct ost_layout o_layout;
- __u32 o_padding_3;
+ __u32 o_layout_version;
__u32 o_uid_h;
__u32 o_gid_h;
@@ -453,6 +453,11 @@ enum lov_comp_md_entry_flags {
#define LCME_KNOWN_FLAGS (LCME_FL_NEG | LCME_FL_INIT)
+/* the highest bit in obdo::o_layout_version is used to mark if the file is
+ * being resynced.
+ */
+#define LU_LAYOUT_RESYNC LCME_FL_NEG
+
/* lcme_id can be specified as certain flags, and the first
* bit of lcme_id is used to indicate that the ID is representing
* certain LCME_FL_* but not a real ID. Which implies we can have
@@ -834,6 +839,8 @@ enum changelog_rec_type {
CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */
CL_CTIME = 18,
CL_ATIME = 19,
+ CL_FLRW = 21, /* FLR: file was first written */
+ CL_RESYNC = 22, /* FLR: file was resync-ed */
CL_LAST
};
@@ -842,7 +849,8 @@ static inline const char *changelog_type2str(int type)
static const char *changelog_str[] = {
"MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK",
"RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC",
- "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME",
+ "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", "",
+ "FLRW", "RESYNC",
};
if (type >= 0 && type < CL_LAST)