@@ -706,6 +706,8 @@ enum md_op_flags {
MF_MDC_CANCEL_FID4 = BIT(3),
MF_GET_MDT_IDX = BIT(4),
MF_GETATTR_BY_FID = BIT(5),
+ MF_QOS_MKDIR = BIT(6),
+ MF_RR_MKDIR = BIT(7),
};
enum md_cli_flags {
@@ -795,6 +797,9 @@ struct md_op_data {
u32 op_projid;
+ /* mkdir: depth of the parent directory */
+ unsigned short op_dir_depth;
+
u16 op_mirror_id;
/*
@@ -442,6 +442,8 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump,
if (IS_ERR(op_data))
return PTR_ERR(op_data);
+ op_data->op_dir_depth = ll_i2info(parent)->lli_depth;
+
if (ll_sbi_has_encrypt(sbi) &&
(IS_ENCRYPTED(parent) ||
unlikely(fscrypt_dummy_context_enabled(parent)))) {
@@ -676,8 +676,11 @@ static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
* of kernel will deal with that later.
*/
ll_set_lock_data(sbi->ll_md_exp, inode, itp, &bits);
- if (bits & MDS_INODELOCK_LOOKUP)
+ if (bits & MDS_INODELOCK_LOOKUP) {
d_lustre_revalidate(de);
+ ll_update_dir_depth(parent->d_inode, d_inode(de));
+ }
+
/* if DoM bit returned along with LAYOUT bit then there
* can be read-on-open data returned.
*/
@@ -178,13 +178,15 @@ struct ll_inode_info {
* -- I am the owner of dir statahead.
*/
pid_t lli_opendir_pid;
+ /* directory depth below ROOT (parent depth + 1) */
+ unsigned short lli_depth;
/* stat will try to access statahead entries or start
* statahead if this flag is set, and this flag will be
* set upon dir open, and cleared when dir is closed,
* statahead hit ratio is too low, or start statahead
* thread failed.
*/
- unsigned int lli_sa_enabled:1;
+ unsigned short lli_sa_enabled:1;
/* generation for statahead */
unsigned int lli_sa_generation;
/* rw lock protects lli_lsm_md */
@@ -1215,6 +1217,7 @@ int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs,
u32 flags);
int ll_update_inode(struct inode *inode, struct lustre_md *md);
void ll_update_inode_flags(struct inode *inode, unsigned int ext_flags);
+void ll_update_dir_depth(struct inode *dir, struct inode *inode);
int ll_read_inode2(struct inode *inode, void *opaque);
void ll_truncate_inode_pages_final(struct inode *inode);
void ll_delete_inode(struct inode *inode);
@@ -2483,6 +2483,23 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md)
return 0;
}
+/* set directory @inode depth to @dir depth + 1, called after LOOKUP lock is fetched. */
+void ll_update_dir_depth(struct inode *dir, struct inode *inode)
+{
+ struct ll_inode_info *lli;
+
+ if (!S_ISDIR(inode->i_mode)) /* depth is only recorded for directories */
+ return;
+
+ if (inode == dir) /* same inode passed as parent and child: nothing to update */
+ return;
+
+ lli = ll_i2info(inode);
+ lli->lli_depth = ll_i2info(dir)->lli_depth + 1; /* one level below @dir */
+ CDEBUG(D_INODE, DFID" depth %hu\n", PFID(&lli->lli_fid),
+ lli->lli_depth);
+}
+
void ll_truncate_inode_pages_final(struct inode *inode)
{
struct address_space *mapping = &inode->i_data;
@@ -741,8 +741,10 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request,
if (!it_disposition(it, DISP_LOOKUP_NEG)) {
/* We have the "lookup" lock, so unhide dentry */
- if (bits & MDS_INODELOCK_LOOKUP)
+ if (bits & MDS_INODELOCK_LOOKUP) {
d_lustre_revalidate(*de);
+ ll_update_dir_depth(parent, d_inode(*de));
+ }
if (encrypt) {
rc = fscrypt_get_encryption_info(inode);
@@ -1415,10 +1417,6 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry,
return rc;
}
- ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits);
- if (bits & MDS_INODELOCK_LOOKUP)
- d_lustre_revalidate(dentry);
-
d_instantiate(dentry, inode);
if (encrypt) {
@@ -1427,8 +1425,17 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry,
return rc;
}
- if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX))
+ if (!(ll_i2sbi(inode)->ll_flags & LL_SBI_FILE_SECCTX)) {
rc = ll_inode_init_security(dentry, inode, dir);
+ if (rc)
+ return rc;
+ }
+
+ ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits);
+ if (bits & MDS_INODELOCK_LOOKUP) {
+ d_lustre_revalidate(dentry);
+ ll_update_dir_depth(dir, inode);
+ }
return rc;
}
@@ -1451,6 +1458,58 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode)
inode->i_ctime.tv_sec = body->mbo_ctime;
}
+/* Record parent depth in @op_data; if @dir is unstriped, not ROOT and has no
+ * default LMV, let a space balanced default LMV on ROOT request QoS/RR mkdir.
+ */
+static void ll_qos_mkdir_prep(struct md_op_data *op_data, struct inode *dir)
+{
+ struct inode *root = dir->i_sb->s_root->d_inode;
+ struct ll_inode_info *rlli = ll_i2info(root);
+ struct ll_inode_info *lli = ll_i2info(dir);
+ struct lmv_stripe_md *lsm;
+
+ op_data->op_dir_depth = lli->lli_depth;
+
+ /* parent directory is striped */
+ if (unlikely(lli->lli_lsm_md))
+ return;
+
+ /* default LMV set on parent directory */
+ if (unlikely(lli->lli_default_lsm_md))
+ return;
+
+ /* parent is ROOT */
+ if (unlikely(dir == root))
+ return;
+
+ /* default LMV not set on ROOT (unlocked check, repeated under lli_lsm_sem) */
+ if (!rlli->lli_default_lsm_md)
+ return;
+
+ down_read(&rlli->lli_lsm_sem);
+ lsm = rlli->lli_default_lsm_md;
+ if (!lsm)
+ goto unlock;
+
+ /* not space balanced */
+ if (lsm->lsm_md_master_mdt_index != LMV_OFFSET_DEFAULT)
+ goto unlock;
+
+ if (lsm->lsm_md_max_inherit != LMV_INHERIT_NONE &&
+ (lsm->lsm_md_max_inherit == LMV_INHERIT_UNLIMITED ||
+ lsm->lsm_md_max_inherit >= lli->lli_depth)) {
+ op_data->op_flags |= MF_QOS_MKDIR; /* max_inherit covers parent depth */
+ if (lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE &&
+ (lsm->lsm_md_max_inherit_rr == LMV_INHERIT_RR_UNLIMITED ||
+ lsm->lsm_md_max_inherit_rr >= lli->lli_depth))
+ op_data->op_flags |= MF_RR_MKDIR; /* max_inherit_rr covers it too */
+ CDEBUG(D_INODE, DFID" requests qos mkdir %#x\n",
+ PFID(&lli->lli_fid), op_data->op_flags);
+ }
+unlock:
+ up_read(&rlli->lli_lsm_sem);
+}
+
static int ll_new_node(struct inode *dir, struct dentry *dentry,
const char *tgt, umode_t mode, int rdev,
u32 opc)
@@ -1475,6 +1534,9 @@ static int ll_new_node(struct inode *dir, struct dentry *dentry,
goto err_exit;
}
+ if (S_ISDIR(mode))
+ ll_qos_mkdir_prep(op_data, dir);
+
if (sbi->ll_flags & LL_SBI_FILE_SECCTX) {
err = ll_dentry_init_security(dentry, mode, &dentry->d_name,
&op_data->op_file_secctx_name,
@@ -1488,8 +1488,11 @@ static int revalidate_statahead_dentry(struct inode *dir,
}
if ((bits & MDS_INODELOCK_LOOKUP) &&
- d_lustre_invalid(*dentryp))
+ d_lustre_invalid(*dentryp)) {
d_lustre_revalidate(*dentryp);
+ ll_update_dir_depth(dir, (*dentryp)->d_inode);
+ }
+
ll_intent_release(&it);
}
}
@@ -1427,7 +1427,8 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
return md_close(tgt->ltd_exp, op_data, mod, request);
}
-static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt)
+static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt,
+ unsigned short dir_depth)
{
struct lu_tgt_desc *tgt, *cur = NULL;
u64 total_avail = 0;
@@ -1470,10 +1471,10 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, u32 *mdt)
/* if current MDT has above-average space, within range of the QOS
* threshold, stay on the same MDT to avoid creating needless remote
- * MDT directories.
+ * MDT directories. The deeper the directory, the more likely it stays.
*/
rand = total_avail * (256 - lmv->lmv_qos.lq_threshold_rr) /
- (total_usable * 256);
+ (total_usable * 256 * (1 + dir_depth / 4));
if (cur && cur->ltd_qos.ltq_avail >= rand) {
tgt = cur;
rc = 0;
@@ -1727,12 +1728,14 @@ static inline bool lmv_op_default_qos_mkdir(const struct md_op_data *op_data)
{
const struct lmv_stripe_md *lsm = op_data->op_default_mea1;
- return lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT;
+ return (op_data->op_flags & MF_QOS_MKDIR) ||
+ (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT);
}
-/* mkdir by QoS in two cases:
- * 1. 'lfs mkdir -i -1'
- * 2. parent default LMV master_mdt_index is -1
+/* mkdir by QoS in three cases:
+ * 1. ROOT default LMV is space balanced.
+ * 2. 'lfs mkdir -i -1'
+ * 3. parent default LMV master_mdt_index is -1
*
* NB, mkdir by QoS only if parent is not striped, this is to avoid remote
* directories under striped directory.
@@ -1754,11 +1757,12 @@ static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data)
return false;
}
-/* if default LMV is set, and its index is LMV_OFFSET_DEFAULT, and
- * 1. max_inherit_rr is set and is not LMV_INHERIT_RR_NONE
+/* if parent default LMV is space balanced, and
+ * 1. max_inherit_rr is set
* 2. or parent is ROOT
- * mkdir roundrobin.
- * NB, this also needs to check server is balanced, which is checked by caller.
+ * mkdir roundrobin. Or if parent doesn't have default LMV, while ROOT default
+ * LMV requests roundrobin mkdir, do the same.
+ * NB, this needs to check server is balanced, which is done by caller.
*/
static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data)
{
@@ -1767,7 +1771,8 @@ static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data)
if (!lmv_op_default_qos_mkdir(op_data))
return false;
- return lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE ||
+ return (op_data->op_flags & MF_RR_MKDIR) ||
+ (lsm && lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE) ||
fid_is_root(&op_data->op_fid1);
}
@@ -1842,7 +1847,8 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
} else if (lmv_op_qos_mkdir(op_data)) {
struct lmv_tgt_desc *tmp = tgt;
- tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
+ tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds,
+ op_data->op_dir_depth);
if (tgt == ERR_PTR(-EAGAIN)) {
if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) &&
!lmv_op_default_rr_mkdir(op_data) &&
@@ -121,10 +121,21 @@ static ssize_t qos_prio_free_store(struct kobject *kobj,
struct obd_device *obd = container_of(kobj, struct obd_device,
obd_kset.kobj);
struct lmv_obd *lmv = &obd->u.lmv;
+ char buf[6], *tmp;
unsigned int val;
int rc;
- rc = kstrtouint(buffer, 0, &val);
+ /* "100%\n\0" should be largest string */
+ if (count >= sizeof(buf))
+ return -ERANGE;
+
+ strncpy(buf, buffer, sizeof(buf));
+ buf[sizeof(buf) - 1] = '\0';
+ tmp = strchr(buf, '%');
+ if (tmp)
+ *tmp = '\0';
+
+ rc = kstrtouint(buf, 0, &val);
if (rc)
return rc;
@@ -158,10 +169,21 @@ static ssize_t qos_threshold_rr_store(struct kobject *kobj,
struct obd_device *obd = container_of(kobj, struct obd_device,
obd_kset.kobj);
struct lmv_obd *lmv = &obd->u.lmv;
+ char buf[6], *tmp;
unsigned int val;
int rc;
- rc = kstrtouint(buffer, 0, &val);
+ /* "100%\n\0" should be largest string */
+ if (count >= sizeof(buf))
+ return -ERANGE;
+
+ strncpy(buf, buffer, sizeof(buf));
+ buf[sizeof(buf) - 1] = '\0';
+ tmp = strchr(buf, '%');
+ if (tmp)
+ *tmp = '\0';
+
+ rc = kstrtouint(buf, 0, &val);
if (rc)
return rc;
@@ -848,6 +848,8 @@ enum {
LMV_INHERIT_RR_DEFAULT = 0,
/* not inherit any more */
LMV_INHERIT_RR_END = 1,
+ /* default inherit_rr of ROOT */
+ LMV_INHERIT_RR_ROOT = 3,
/* max inherit depth */
LMV_INHERIT_RR_MAX = 250,
/* [251, 254] are reserved */