@@ -55,12 +55,6 @@ struct lmv_stripe_md {
struct lmv_oinfo lsm_md_oinfo[0];
};
-static inline bool lmv_is_known_hash_type(u32 type)
-{
- return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 ||
- (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS;
-}
-
static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm)
{
return lsm && lsm->lsm_md_magic == LMV_MAGIC;
@@ -89,12 +83,6 @@ static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm)
return !lmv_is_known_hash_type(lsm->lsm_md_hash_type);
}
-/* NB, this is checking directory default LMV */
-static inline bool lmv_dir_qos_mkdir(const struct lmv_stripe_md *lsm)
-{
- return lsm && (lsm->lsm_md_hash_type & LMV_HASH_FLAG_SPACE);
-}
-
static inline bool
lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2)
{
@@ -306,22 +306,10 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
/*
* open(O_CREAT | O_EXCL) needs to check
* existing name, which should be done on both
- * old and new layout, to avoid creating new
- * file under old layout, check old layout on
+ * old and new layout, check old layout on
* client side.
*/
- tgt = lmv_locate_tgt(lmv, op_data);
- if (IS_ERR(tgt))
- return PTR_ERR(tgt);
-
- rc = md_getattr_name(tgt->ltd_exp, op_data,
- reqp);
- if (!rc) {
- ptlrpc_req_finished(*reqp);
- *reqp = NULL;
- return -EEXIST;
- }
-
+ rc = lmv_migrate_existence_check(lmv, op_data);
if (rc != -ENOENT)
return rc;
@@ -49,7 +49,6 @@ int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
u64 extra_lock_flags);
int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds);
-int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds);
int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
struct lu_fid *fid, struct md_op_data *op_data);
@@ -217,8 +216,9 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data)
struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv,
struct md_op_data *op_data);
+int lmv_migrate_existence_check(struct lmv_obd *lmv,
+ struct md_op_data *op_data);
/* lproc_lmv.c */
int lmv_tunables_init(struct obd_device *obd);
-
#endif
@@ -1045,106 +1045,36 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
return rc;
}
-/**
- * This is _inode_ placement policy function (not name).
- */
-static u32 lmv_placement_policy(struct obd_device *obd,
- struct md_op_data *op_data)
+int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
+ struct lu_fid *fid, struct md_op_data *op_data)
{
+ struct obd_device *obd = class_exp2obd(exp);
struct lmv_obd *lmv = &obd->u.lmv;
- struct lmv_user_md *lum;
- u32 mdt;
-
- if (lmv->lmv_mdt_count == 1)
- return 0;
-
- lum = op_data->op_data;
- /*
- * Choose MDT by
- * 1. See if the stripe offset is specified by lum.
- * 2. If parent has default LMV, and its hash type is "space", choose
- * MDT with QoS. (see lmv_locate_tgt_qos()).
- * 3. Then check if default LMV stripe offset is not -1.
- * 4. Finally choose MDS by name hash if the parent
- * is striped directory. (see lmv_locate_tgt()).
- *
- * presently explicit MDT location is not supported
- * for foreign dirs (as it can't be embedded into free
- * format LMV, like with lum_stripe_offset), so we only
- * rely on default stripe offset or then name hashing.
- */
- if (op_data->op_cli_flags & CLI_SET_MEA && lum &&
- le32_to_cpu(lum->lum_magic != LMV_MAGIC_FOREIGN) &&
- le32_to_cpu(lum->lum_stripe_offset) != (u32)-1) {
- mdt = le32_to_cpu(lum->lum_stripe_offset);
- } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
- !lmv_dir_striped(op_data->op_mea1) &&
- lmv_dir_qos_mkdir(op_data->op_default_mea1)) {
- mdt = op_data->op_mds;
- } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
- op_data->op_default_mea1 &&
- op_data->op_default_mea1->lsm_md_master_mdt_index !=
- (u32)-1) {
- mdt = op_data->op_default_mea1->lsm_md_master_mdt_index;
- op_data->op_mds = mdt;
- } else {
- mdt = op_data->op_mds;
- }
-
- return mdt;
-}
-
-int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds)
-{
struct lmv_tgt_desc *tgt;
int rc;
- tgt = lmv_tgt(lmv, mds);
+ LASSERT(op_data);
+ LASSERT(fid);
+
+ tgt = lmv_tgt(lmv, op_data->op_mds);
if (!tgt)
return -ENODEV;
+ if (!tgt->ltd_active || !tgt->ltd_exp)
+ return -ENODEV;
+
/*
* New seq alloc and FLD setup should be atomic. Otherwise we may find
* on server that seq in new allocated fid is not yet known.
*/
mutex_lock(&tgt->ltd_fid_mutex);
-
- if (tgt->ltd_active == 0 || !tgt->ltd_exp) {
- rc = -ENODEV;
- goto out;
- }
-
- /*
- * Asking underlaying tgt layer to allocate new fid.
- */
rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL);
+ mutex_unlock(&tgt->ltd_fid_mutex);
if (rc > 0) {
LASSERT(fid_is_sane(fid));
rc = 0;
}
-out:
- mutex_unlock(&tgt->ltd_fid_mutex);
- return rc;
-}
-
-int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
- struct lu_fid *fid, struct md_op_data *op_data)
-{
- struct obd_device *obd = class_exp2obd(exp);
- struct lmv_obd *lmv = &obd->u.lmv;
- u32 mds;
- int rc;
-
- LASSERT(op_data);
- LASSERT(fid);
-
- mds = lmv_placement_policy(obd, op_data);
-
- rc = __lmv_fid_alloc(lmv, fid, mds);
- if (rc)
- CERROR("Can't alloc new fid, rc %d\n", rc);
-
return rc;
}
@@ -1624,8 +1554,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, u32 *mdt)
* which is set outside, and if dir is migrating, 'op_data->op_post_migrate'
* indicates whether old or new layout is used to locate.
*
- * For plain direcotry, normally it will locate MDT by FID, but if this
- * directory has default LMV, and its hash type is "space", locate MDT with QoS.
+ * For plain directory, it just locates the MDT of op_data->op_fid1.
*
* @lmv: LMV device
* @op_data: client MD stack parameters, name, namelen
@@ -1650,7 +1579,7 @@ struct lmv_tgt_desc *
* ct_restore().
*/
if (op_data->op_bias & MDS_CREATE_VOLATILE &&
- (int)op_data->op_mds != -1) {
+ op_data->op_mds != LMV_OFFSET_DEFAULT) {
tgt = lmv_tgt(lmv, op_data->op_mds);
if (!tgt)
return ERR_PTR(-ENODEV);
@@ -1679,30 +1608,7 @@ struct lmv_tgt_desc *
tgt = lmv_tgt(lmv, oinfo->lmo_mds);
if (!tgt)
- tgt = ERR_PTR(-ENODEV);
- } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
- lmv_dir_qos_mkdir(op_data->op_default_mea1) &&
- !lmv_dir_striped(lsm)) {
- tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
- if (tgt == ERR_PTR(-EAGAIN))
- tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
- /*
- * only update statfs when mkdir under dir with "space" hash,
- * this means the cached statfs may be stale, and current mkdir
- * may not follow QoS accurately, but it's not serious, and it
- * avoids periodic statfs when client doesn't mkdir under
- * "space" hashed directories.
- *
- * TODO: after MDT support QoS object allocation, also update
- * statfs for 'lfs mkdir -i -1 ...", currently it's done in user
- * space.
- */
- if (!IS_ERR(tgt)) {
- struct obd_device *obd;
-
- obd = container_of(lmv, struct obd_device, u.lmv);
- lmv_statfs_check_update(obd, tgt);
- }
+ return ERR_PTR(-ENODEV);
} else {
tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1,
op_data->op_name, op_data->op_namelen,
@@ -1755,6 +1661,78 @@ struct lmv_tgt_desc *
&op_data->op_mds, true);
}
+int lmv_migrate_existence_check(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+ struct lu_tgt_desc *tgt;
+ struct ptlrpc_request *request;
+ int rc;
+
+ LASSERT(lmv_dir_migrating(op_data->op_mea1));
+
+ tgt = lmv_locate_tgt(lmv, op_data);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_getattr_name(tgt->ltd_exp, op_data, &request);
+ if (!rc) { /* lookup succeeded: the name already exists */
+ ptlrpc_req_finished(request); /* reply itself is not needed */
+ return -EEXIST;
+ }
+
+ return rc; /* non-zero lookup result; callers continue only on -ENOENT */
+}
+
+/* Return true when mkdir should pick its MDT by QoS, in two cases:
+ * 1. 'lfs mkdir -i -1' (user lum with lum_stripe_offset == -1)
+ * 2. parent default LMV master_mdt_index is -1
+ *
+ * NB, mkdir by QoS only if parent is not striped, this is to avoid remote
+ * directories under striped directory.
+ */
+static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data)
+{
+ const struct lmv_stripe_md *lsm = op_data->op_default_mea1;
+ const struct lmv_user_md *lum = op_data->op_data;
+
+ if (op_data->op_code != LUSTRE_OPC_MKDIR)
+ return false;
+
+ if (lmv_dir_striped(op_data->op_mea1))
+ return false;
+
+ if (op_data->op_cli_flags & CLI_SET_MEA && lum &&
+ (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+ le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) &&
+ le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT)
+ return true;
+
+ if (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT)
+ return true;
+
+ return false;
+}
+
+/* true for 'lfs mkdir -i <specific_MDT>': user lum_stripe_offset is not -1 */
+static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data)
+{
+ const struct lmv_user_md *lum = op_data->op_data;
+
+ return op_data->op_code == LUSTRE_OPC_MKDIR &&
+ op_data->op_cli_flags & CLI_SET_MEA && lum &&
+ (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+ le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) &&
+ le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT;
+}
+
+/* true for mkdir when parent default LMV master_mdt_index is not -1. */
+static inline bool
+lmv_op_default_specific_mkdir(const struct md_op_data *op_data)
+{
+ return op_data->op_code == LUSTRE_OPC_MKDIR &&
+ op_data->op_default_mea1 &&
+ op_data->op_default_mea1->lsm_md_master_mdt_index !=
+ LMV_OFFSET_DEFAULT;
+}
int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
const void *data, size_t datalen, umode_t mode, uid_t uid,
gid_t gid, kernel_cap_t cap_effective, u64 rdev,
@@ -1774,20 +1752,9 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
if (lmv_dir_migrating(op_data->op_mea1)) {
/*
* if parent is migrating, create() needs to lookup existing
- * name, to avoid creating new file under old layout of
- * migrating directory, check old layout here.
+ * name in both old and new layout, check old layout on client.
*/
- tgt = lmv_locate_tgt(lmv, op_data);
- if (IS_ERR(tgt))
- return PTR_ERR(tgt);
-
- rc = md_getattr_name(tgt->ltd_exp, op_data, request);
- if (!rc) {
- ptlrpc_req_finished(*request);
- *request = NULL;
- return -EEXIST;
- }
-
+ rc = lmv_migrate_existence_check(lmv, op_data);
if (rc != -ENOENT)
return rc;
@@ -1798,28 +1765,44 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
if (IS_ERR(tgt))
return PTR_ERR(tgt);
- CDEBUG(D_INODE, "CREATE name '%.*s' on " DFID " -> mds #%x\n",
- (int)op_data->op_namelen, op_data->op_name,
- PFID(&op_data->op_fid1), op_data->op_mds);
-
- rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
- if (rc)
- return rc;
-
- if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) {
+ if (lmv_op_qos_mkdir(op_data)) {
+ tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
+ if (tgt == ERR_PTR(-EAGAIN))
+ tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
/*
- * Send the create request to the MDT where the object
- * will be located
+ * only update statfs after QoS mkdir, this means the cached
+ * statfs may be stale, and current mkdir may not follow QoS
+ * accurately, but it's not serious, and avoids periodic statfs
+ * when client doesn't mkdir by QoS.
*/
- tgt = lmv_fid2tgt(lmv, &op_data->op_fid2);
- if (IS_ERR(tgt))
- return PTR_ERR(tgt);
+ if (!IS_ERR(tgt))
+ lmv_statfs_check_update(obd, tgt);
+ } else if (lmv_op_user_specific_mkdir(op_data)) {
+ struct lmv_user_md *lum = op_data->op_data;
- op_data->op_mds = tgt->ltd_index;
+ op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+ tgt = lmv_tgt(lmv, op_data->op_mds);
+ if (!tgt)
+ return -ENODEV;
+ } else if (lmv_op_default_specific_mkdir(op_data)) {
+ op_data->op_mds =
+ op_data->op_default_mea1->lsm_md_master_mdt_index;
+ tgt = lmv_tgt(lmv, op_data->op_mds);
+ if (!tgt)
+ return -ENODEV;
}
- CDEBUG(D_INODE, "CREATE obj " DFID " -> mds #%x\n",
- PFID(&op_data->op_fid1), op_data->op_mds);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
+ if (rc)
+ return rc;
+
+ CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on " DFID " -> mds #%x\n",
+ (int)op_data->op_namelen, op_data->op_name,
+ PFID(&op_data->op_fid2), PFID(&op_data->op_fid1),
+ op_data->op_mds);
op_data->op_flags |= MF_MDC_CANCEL_FID1;
rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
@@ -2063,10 +2046,20 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
if (IS_ERR(child_tgt))
return PTR_ERR(child_tgt);
- if (!S_ISDIR(op_data->op_mode) && tp_tgt)
- rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_index);
- else
- rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
+ /* for directory, migrate to MDT specified by lum_stripe_offset;
+ * otherwise migrate to the target stripe of parent. But parent
+ * directory may have finished migration (normally current file too),
+ * in which case allocate FID on MDT lum_stripe_offset, and server
+ * will check whether file was migrated already.
+ */
+ if (S_ISDIR(op_data->op_mode) || !tp_tgt) {
+ struct lmv_user_md *lum = op_data->op_data;
+
+ op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+ } else {
+ op_data->op_mds = tp_tgt->ltd_index;
+ }
+ rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
if (rc)
return rc;
@@ -3071,7 +3064,7 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm,
* set default value -1, so lmv_locate_tgt() knows this stripe
* target is not initialized.
*/
- lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1;
+ lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT;
if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid))
continue;
@@ -106,10 +106,6 @@ int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt)
u32 id = 0;
int rc = 0;
- /* tgt not connected, this function will be called again later */
- if (!exp)
- return 0;
-
down_write(&qos->lq_rw_sem);
/*
* a bit hacky approach to learn NID of corresponding connection
@@ -528,7 +524,7 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
* per-tgt penalty is
* prio * bavail * iavail / (num_tgt - 1) / 2
*/
- tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
+ tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8;
do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active);
tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
@@ -562,8 +558,9 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
ba = svr->lsq_bavail;
ia = svr->lsq_iavail;
- svr->lsq_penalty_per_obj = prio_wide * ba * ia;
- do_div(ba, svr->lsq_tgt_count * num_active);
+ svr->lsq_penalty_per_obj = prio_wide * ba * ia >> 8;
+ do_div(svr->lsq_penalty_per_obj,
+ svr->lsq_tgt_count * num_active);
svr->lsq_penalty_per_obj >>= 1;
age = (now - svr->lsq_used) >> 3;
@@ -656,6 +653,7 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
if (!tgt->ltd_active)
continue;
+ ltq = &tgt->ltd_qos;
if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
ltq->ltq_penalty = 0;
else
@@ -668,9 +666,10 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
*total_wt += ltq->ltq_weight;
CDEBUG(D_OTHER,
- "recalc tgt %d usable=%d avail=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n",
+ "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n",
tgt->ltd_index, ltq->ltq_usable,
- tgt_statfs_bavail(tgt) >> 10,
+ tgt_statfs_bavail(tgt) >> 16,
+ tgt_statfs_iavail(tgt) >> 8,
ltq->ltq_penalty_per_obj >> 10,
ltq->ltq_penalty >> 10,
ltq->ltq_svr->lsq_penalty_per_obj >> 10,
@@ -1663,7 +1663,6 @@ void lustre_assert_wire_constants(void)
BUILD_BUG_ON(LMV_MAGIC_V1 != 0x0CD20CD0);
BUILD_BUG_ON(LMV_MAGIC_STRIPE != 0x0CD40CD0);
BUILD_BUG_ON(LMV_HASH_TYPE_MASK != 0x0000ffff);
- BUILD_BUG_ON(LMV_HASH_FLAG_SPACE != 0x08000000);
BUILD_BUG_ON(LMV_HASH_FLAG_MIGRATION != 0x80000000);
/* Checks for struct obd_statfs */
@@ -429,6 +429,7 @@ static inline bool lov_pattern_supported_normal_comp(__u32 pattern)
#define LOV_MAXPOOLNAME 15
#define LOV_POOLNAMEF "%.15s"
#define LOV_OFFSET_DEFAULT ((__u16)-1)
+#define LMV_OFFSET_DEFAULT ((__u32)-1)
#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */
#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS)
@@ -687,10 +688,11 @@ enum lmv_hash_type {
*/
#define LMV_HASH_TYPE_MASK 0x0000ffff
-/* once this is set on a plain directory default layout, newly created
- * subdirectories will be distributed on all MDTs by space usage.
- */
-#define LMV_HASH_FLAG_SPACE 0x08000000
+static inline bool lmv_is_known_hash_type(__u32 type)
+{
+ return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 ||
+ (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS;
+}
/* The striped directory has ever lost its master LMV EA, then LFSCK
* re-generated it. This flag is used to indicate such case. It is an