diff mbox

ceph: optimize memory usage

Message ID 1520252268-69546-1-git-send-email-cgxu519@gmx.com (mailing list archive)
State New, archived
Headers show

Commit Message

Chengguang Xu March 5, 2018, 12:17 p.m. UTC
In current code, regular file and directory use same struct
ceph_file_info to store fs specific data so the struct has to
include some fields which are only used for directory
(e.g., readdir related info), when having plenty of regular files,
it will lead to memory waste.

This patch introduces dedicated readdir_info cache to separate readdir
related things from struct ceph_file_info, so that regular file could
not include those unused fields anymore. Also, chagned to manipulate
fscache only when reuglar file successfully acquired ceph_file_info.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
---
 fs/ceph/dir.c                | 158 +++++++++++++++++++++++--------------------
 fs/ceph/file.c               |  88 ++++++++++++++++--------
 fs/ceph/super.c              |  14 +++-
 fs/ceph/super.h              |  19 ++++--
 include/linux/ceph/libceph.h |   1 +
 5 files changed, 170 insertions(+), 110 deletions(-)

Comments

Yan, Zheng March 6, 2018, 8:34 a.m. UTC | #1
On Mon, Mar 5, 2018 at 8:17 PM, Chengguang Xu <cgxu519@gmx.com> wrote:
> In current code, regular file and directory use same struct
> ceph_file_info to store fs specific data so the struct has to
> include some fields which are only used for directory
> (e.g., readdir related info), when having plenty of regular files,
> it will lead to memory waste.
>
> This patch introduces dedicated readdir_info cache to separate readdir
> related things from struct ceph_file_info, so that regular file could
> not include those unused fields anymore. Also, chagned to manipulate
> fscache only when reuglar file successfully acquired ceph_file_info.
>
> Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
> ---
>  fs/ceph/dir.c                | 158 +++++++++++++++++++++++--------------------
>  fs/ceph/file.c               |  88 ++++++++++++++++--------
>  fs/ceph/super.c              |  14 +++-
>  fs/ceph/super.h              |  19 ++++--
>  include/linux/ceph/libceph.h |   1 +
>  5 files changed, 170 insertions(+), 110 deletions(-)
>
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 0c43468..84bc75a 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -105,15 +105,17 @@ static int fpos_cmp(loff_t l, loff_t r)
>  static int note_last_dentry(struct ceph_file_info *fi, const char *name,
>                             int len, unsigned next_offset)
>  {
> +       struct ceph_readdir_info *ri = fi->readdir_info;
>         char *buf = kmalloc(len+1, GFP_KERNEL);
>         if (!buf)
>                 return -ENOMEM;
> -       kfree(fi->last_name);
> -       fi->last_name = buf;
> -       memcpy(fi->last_name, name, len);
> -       fi->last_name[len] = 0;
> -       fi->next_offset = next_offset;
> -       dout("note_last_dentry '%s'\n", fi->last_name);
> +
> +       kfree(ri->last_name);
> +       ri->last_name = buf;
> +       memcpy(ri->last_name, name, len);
> +       ri->last_name[len] = 0;
> +       ri->next_offset = next_offset;
> +       dout("note_last_dentry '%s'\n", ri->last_name);
>         return 0;
>  }
>
> @@ -176,6 +178,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>                             int shared_gen)
>  {
>         struct ceph_file_info *fi = file->private_data;
> +       struct ceph_readdir_info *ri = fi->readdir_info;
>         struct dentry *parent = file->f_path.dentry;
>         struct inode *dir = d_inode(parent);
>         struct dentry *dentry, *last = NULL;
> @@ -279,9 +282,9 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>                         err = ret;
>                 dput(last);
>                 /* last_name no longer match cache index */
> -               if (fi->readdir_cache_idx >= 0) {
> -                       fi->readdir_cache_idx = -1;
> -                       fi->dir_release_count = 0;
> +               if (ri->readdir_cache_idx >= 0) {
> +                       ri->readdir_cache_idx = -1;
> +                       ri->dir_release_count = 0;
>                 }
>         }
>         return err;
> @@ -289,17 +292,20 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>
>  static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
>  {
> -       if (!fi->last_readdir)
> +       struct ceph_readdir_info *ri = fi->readdir_info;
> +
> +       if (!ri->last_readdir)
>                 return true;
>         if (is_hash_order(pos))
> -               return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
> +               return !ceph_frag_contains_value(ri->frag, fpos_hash(pos));
>         else
> -               return fi->frag != fpos_frag(pos);
> +               return ri->frag != fpos_frag(pos);
>  }
>
>  static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  {
>         struct ceph_file_info *fi = file->private_data;
> +       struct ceph_readdir_info *ri = fi->readdir_info;
>         struct inode *inode = file_inode(file);
>         struct ceph_inode_info *ci = ceph_inode(inode);
>         struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> @@ -357,9 +363,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                         CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
>
>                 /* discard old result, if any */
> -               if (fi->last_readdir) {
> -                       ceph_mdsc_put_request(fi->last_readdir);
> -                       fi->last_readdir = NULL;
> +               if (ri->last_readdir) {
> +                       ceph_mdsc_put_request(ri->last_readdir);
> +                       ri->last_readdir = NULL;
>                 }
>
>                 if (is_hash_order(ctx->pos)) {
> @@ -373,7 +379,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                 }
>
>                 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
> -                    ceph_vinop(inode), frag, fi->last_name);
> +                    ceph_vinop(inode), frag, ri->last_name);
>                 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
>                 if (IS_ERR(req))
>                         return PTR_ERR(req);
> @@ -389,8 +395,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                         __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
>                         req->r_inode_drop = CEPH_CAP_FILE_EXCL;
>                 }
> -               if (fi->last_name) {
> -                       req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
> +               if (ri->last_name) {
> +                       req->r_path2 = kstrdup(ri->last_name, GFP_KERNEL);
>                         if (!req->r_path2) {
>                                 ceph_mdsc_put_request(req);
>                                 return -ENOMEM;
> @@ -400,10 +406,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                                 cpu_to_le32(fpos_hash(ctx->pos));
>                 }
>
> -               req->r_dir_release_cnt = fi->dir_release_count;
> -               req->r_dir_ordered_cnt = fi->dir_ordered_count;
> -               req->r_readdir_cache_idx = fi->readdir_cache_idx;
> -               req->r_readdir_offset = fi->next_offset;
> +               req->r_dir_release_cnt = ri->dir_release_count;
> +               req->r_dir_ordered_cnt = ri->dir_ordered_count;
> +               req->r_readdir_cache_idx = ri->readdir_cache_idx;
> +               req->r_readdir_offset = ri->next_offset;
>                 req->r_args.readdir.frag = cpu_to_le32(frag);
>                 req->r_args.readdir.flags =
>                                 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
> @@ -427,35 +433,35 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
>                         frag = le32_to_cpu(rinfo->dir_dir->frag);
>                         if (!rinfo->hash_order) {
> -                               fi->next_offset = req->r_readdir_offset;
> +                               ri->next_offset = req->r_readdir_offset;
>                                 /* adjust ctx->pos to beginning of frag */
>                                 ctx->pos = ceph_make_fpos(frag,
> -                                                         fi->next_offset,
> +                                                         ri->next_offset,
>                                                           false);
>                         }
>                 }
>
> -               fi->frag = frag;
> -               fi->last_readdir = req;
> +               ri->frag = frag;
> +               ri->last_readdir = req;
>
>                 if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
> -                       fi->readdir_cache_idx = req->r_readdir_cache_idx;
> -                       if (fi->readdir_cache_idx < 0) {
> +                       ri->readdir_cache_idx = req->r_readdir_cache_idx;
> +                       if (ri->readdir_cache_idx < 0) {
>                                 /* preclude from marking dir ordered */
> -                               fi->dir_ordered_count = 0;
> +                               ri->dir_ordered_count = 0;
>                         } else if (ceph_frag_is_leftmost(frag) &&
> -                                  fi->next_offset == 2) {
> +                                  ri->next_offset == 2) {
>                                 /* note dir version at start of readdir so
>                                  * we can tell if any dentries get dropped */
> -                               fi->dir_release_count = req->r_dir_release_cnt;
> -                               fi->dir_ordered_count = req->r_dir_ordered_cnt;
> +                               ri->dir_release_count = req->r_dir_release_cnt;
> +                               ri->dir_ordered_count = req->r_dir_ordered_cnt;
>                         }
>                 } else {
>                         dout("readdir !did_prepopulate");
>                         /* disable readdir cache */
> -                       fi->readdir_cache_idx = -1;
> +                       ri->readdir_cache_idx = -1;
>                         /* preclude from marking dir complete */
> -                       fi->dir_release_count = 0;
> +                       ri->dir_release_count = 0;
>                 }
>
>                 /* note next offset and last dentry name */
> @@ -469,14 +475,14 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                         if (err)
>                                 return err;
>                 } else if (req->r_reply_info.dir_end) {
> -                       fi->next_offset = 2;
> +                       ri->next_offset = 2;
>                         /* keep last name */
>                 }
>         }
>
> -       rinfo = &fi->last_readdir->r_reply_info;
> +       rinfo = &ri->last_readdir->r_reply_info;
>         dout("readdir frag %x num %d pos %llx chunk first %llx\n",
> -            fi->frag, rinfo->dir_nr, ctx->pos,
> +            ri->frag, rinfo->dir_nr, ctx->pos,
>              rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
>
>         i = 0;
> @@ -520,27 +526,27 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                 ctx->pos++;
>         }
>
> -       ceph_mdsc_put_request(fi->last_readdir);
> -       fi->last_readdir = NULL;
> +       ceph_mdsc_put_request(ri->last_readdir);
> +       ri->last_readdir = NULL;
>
> -       if (fi->next_offset > 2) {
> -               frag = fi->frag;
> +       if (ri->next_offset > 2) {
> +               frag = ri->frag;
>                 goto more;
>         }
>
>         /* more frags? */
> -       if (!ceph_frag_is_rightmost(fi->frag)) {
> -               frag = ceph_frag_next(fi->frag);
> +       if (!ceph_frag_is_rightmost(ri->frag)) {
> +               frag = ceph_frag_next(ri->frag);
>                 if (is_hash_order(ctx->pos)) {
>                         loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
> -                                                       fi->next_offset, true);
> +                                                       ri->next_offset, true);
>                         if (new_pos > ctx->pos)
>                                 ctx->pos = new_pos;
>                         /* keep last_name */
>                 } else {
> -                       ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
> -                       kfree(fi->last_name);
> -                       fi->last_name = NULL;
> +                       ctx->pos = ceph_make_fpos(frag, ri->next_offset, false);
> +                       kfree(ri->last_name);
> +                       ri->last_name = NULL;
>                 }
>                 dout("readdir next frag is %x\n", frag);
>                 goto more;
> @@ -552,20 +558,20 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>          * were released during the whole readdir, and we should have
>          * the complete dir contents in our cache.
>          */
> -       if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
> +       if (atomic64_read(&ci->i_release_count) == ri->dir_release_count) {
>                 spin_lock(&ci->i_ceph_lock);
> -               if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
> +               if (ri->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
>                         dout(" marking %p complete and ordered\n", inode);
>                         /* use i_size to track number of entries in
>                          * readdir cache */
> -                       BUG_ON(fi->readdir_cache_idx < 0);
> -                       i_size_write(inode, fi->readdir_cache_idx *
> +                       BUG_ON(ri->readdir_cache_idx < 0);
> +                       i_size_write(inode, ri->readdir_cache_idx *
>                                      sizeof(struct dentry*));
>                 } else {
>                         dout(" marking %p complete\n", inode);
>                 }
> -               __ceph_dir_set_complete(ci, fi->dir_release_count,
> -                                       fi->dir_ordered_count);
> +               __ceph_dir_set_complete(ci, ri->dir_release_count,
> +                                       ri->dir_ordered_count);
>                 spin_unlock(&ci->i_ceph_lock);
>         }
>
> @@ -575,15 +581,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>
>  static void reset_readdir(struct ceph_file_info *fi)
>  {
> -       if (fi->last_readdir) {
> -               ceph_mdsc_put_request(fi->last_readdir);
> -               fi->last_readdir = NULL;
> +       struct ceph_readdir_info *ri = fi->readdir_info;
> +
> +       if (ri->last_readdir) {
> +               ceph_mdsc_put_request(ri->last_readdir);
> +               ri->last_readdir = NULL;
>         }
> -       kfree(fi->last_name);
> -       fi->last_name = NULL;
> -       fi->dir_release_count = 0;
> -       fi->readdir_cache_idx = -1;
> -       fi->next_offset = 2;  /* compensate for . and .. */
> +       kfree(ri->last_name);
> +       ri->last_name = NULL;
> +       ri->dir_release_count = 0;
> +       ri->readdir_cache_idx = -1;
> +       ri->next_offset = 2;  /* compensate for . and .. */
>         fi->flags &= ~CEPH_F_ATEND;
>  }
>
> @@ -594,16 +602,18 @@ static void reset_readdir(struct ceph_file_info *fi)
>  static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
>  {
>         struct ceph_mds_reply_info_parsed *rinfo;
> +       struct ceph_readdir_info *ri = fi->readdir_info;
>         loff_t chunk_offset;
> +
>         if (new_pos == 0)
>                 return true;
>         if (is_hash_order(new_pos)) {
>                 /* no need to reset last_name for a forward seek when
>                  * dentries are sotred in hash order */
> -       } else if (fi->frag != fpos_frag(new_pos)) {
> +       } else if (ri->frag != fpos_frag(new_pos)) {
>                 return true;
>         }
> -       rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
> +       rinfo = ri->last_readdir ? &ri->last_readdir->r_reply_info : NULL;
>         if (!rinfo || !rinfo->dir_nr)
>                 return true;
>         chunk_offset = rinfo->dir_entries[0].offset;
> @@ -614,6 +624,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
>  static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
>  {
>         struct ceph_file_info *fi = file->private_data;
> +       struct ceph_readdir_info *ri = fi->readdir_info;
>         struct inode *inode = file->f_mapping->host;
>         loff_t retval;
>
> @@ -637,8 +648,8 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
>                 } else if (is_hash_order(offset) && offset > file->f_pos) {
>                         /* for hash offset, we don't know if a forward seek
>                          * is within same frag */
> -                       fi->dir_release_count = 0;
> -                       fi->readdir_cache_idx = -1;
> +                       ri->dir_release_count = 0;
> +                       ri->readdir_cache_idx = -1;
>                 }
>
>                 if (offset != file->f_pos) {
> @@ -1371,6 +1382,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
>                              loff_t *ppos)
>  {
>         struct ceph_file_info *cf = file->private_data;
> +       struct ceph_readdir_info *ri = cf->readdir_info;
>         struct inode *inode = file_inode(file);
>         struct ceph_inode_info *ci = ceph_inode(inode);
>         int left;
> @@ -1379,12 +1391,12 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
>         if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
>                 return -EISDIR;
>
> -       if (!cf->dir_info) {
> -               cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
> -               if (!cf->dir_info)
> +       if (!ri->dir_info) {
> +               ri->dir_info = kmalloc(bufsize, GFP_KERNEL);
> +               if (!ri->dir_info)
>                         return -ENOMEM;
> -               cf->dir_info_len =
> -                       snprintf(cf->dir_info, bufsize,
> +               ri->dir_info_len =
> +                       snprintf(ri->dir_info, bufsize,
>                                 "entries:   %20lld\n"
>                                 " files:    %20lld\n"
>                                 " subdirs:  %20lld\n"
> @@ -1404,10 +1416,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
>                                 (long)ci->i_rctime.tv_nsec);
>         }
>
> -       if (*ppos >= cf->dir_info_len)
> +       if (*ppos >= ri->dir_info_len)
>                 return 0;
> -       size = min_t(unsigned, size, cf->dir_info_len-*ppos);
> -       left = copy_to_user(buf, cf->dir_info + *ppos, size);
> +       size = min_t(unsigned int, size, ri->dir_info_len-*ppos);
> +       left = copy_to_user(buf, ri->dir_info + *ppos, size);
>         if (left == size)
>                 return -EFAULT;
>         *ppos += (size - left);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 6639926..611aaef 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -159,36 +159,60 @@ static size_t dio_get_pagev_size(const struct iov_iter *it)
>         return req;
>  }
>
> +
> +static int ceph_init_file_info(struct inode *inode,
> +                       struct file *file, int fmode)
> +{
> +       struct ceph_file_info *fi;
> +
> +       dout("init_file inode %p file %p mode 0%o (%s)\n", inode, file,
> +               inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir");
> +
> +       BUG_ON(inode->i_fop->release != ceph_release);
> +
> +       fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> +       if (!fi) {
> +               ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> +               return -ENOMEM;
> +       }
> +
> +       if (S_ISDIR(inode->i_mode)) {
> +               fi->readdir_info = kmem_cache_zalloc(ceph_readdir_cachep,
> +                                                       GFP_KERNEL);
> +               if (!fi->readdir_info) {
> +                       kmem_cache_free(ceph_file_cachep, fi);
> +                       return -ENOMEM;
> +               }
> +
> +               fi->readdir_info->next_offset = 2;
> +               fi->readdir_info->readdir_cache_idx = -1;
> +       } else {
> +               ceph_fscache_register_inode_cookie(inode);
> +               ceph_fscache_file_set_cookie(inode, file);
> +       }
> +
> +       fi->fmode = fmode;
> +       spin_lock_init(&fi->rw_contexts_lock);
> +       INIT_LIST_HEAD(&fi->rw_contexts);
> +       file->private_data = fi;
> +
> +       return 0;
> +}
> +
>  /*
>   * initialize private struct file data.
>   * if we fail, clean up by dropping fmode reference on the ceph_inode
>   */
>  static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
>  {
> -       struct ceph_file_info *cf;
>         int ret = 0;
>
>         switch (inode->i_mode & S_IFMT) {
>         case S_IFREG:
> -               ceph_fscache_register_inode_cookie(inode);
> -               ceph_fscache_file_set_cookie(inode, file);
>         case S_IFDIR:
> -               dout("init_file %p %p 0%o (regular)\n", inode, file,
> -                    inode->i_mode);
> -               cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> -               if (!cf) {
> -                       ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> -                       return -ENOMEM;
> -               }
> -               cf->fmode = fmode;
> -
> -               spin_lock_init(&cf->rw_contexts_lock);
> -               INIT_LIST_HEAD(&cf->rw_contexts);
> -
> -               cf->next_offset = 2;
> -               cf->readdir_cache_idx = -1;
> -               file->private_data = cf;
> -               BUG_ON(inode->i_fop->release != ceph_release);
> +               ret = ceph_init_file_info(inode, file, fmode);
> +               if (ret)
> +                       return ret;
>                 break;
>
>         case S_IFLNK:
> @@ -460,16 +484,24 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  int ceph_release(struct inode *inode, struct file *file)
>  {
>         struct ceph_inode_info *ci = ceph_inode(inode);
> -       struct ceph_file_info *cf = file->private_data;
> +       struct ceph_file_info *fi = file->private_data;
> +       struct ceph_readdir_info *ri = fi->readdir_info;
> +
> +       dout("release inode %p file %p mode 0%o (%s)\n", inode, file,
> +               inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir");
>
> -       dout("release inode %p file %p\n", inode, file);
> -       ceph_put_fmode(ci, cf->fmode);
> -       if (cf->last_readdir)
> -               ceph_mdsc_put_request(cf->last_readdir);
> -       kfree(cf->last_name);
> -       kfree(cf->dir_info);
> -       WARN_ON(!list_empty(&cf->rw_contexts));
> -       kmem_cache_free(ceph_file_cachep, cf);
> +       WARN_ON(!list_empty(&fi->rw_contexts));
> +
> +       ceph_put_fmode(ci, fi->fmode);
> +       if (ri) {
> +               if (ri->last_readdir)
> +                       ceph_mdsc_put_request(ri->last_readdir);
> +               kfree(ri->last_name);
> +               kfree(ri->dir_info);
> +               kmem_cache_free(ceph_readdir_cachep, ri);
> +               ri = NULL;
> +       }
> +       kmem_cache_free(ceph_file_cachep, fi);
>
>         /* wake up anyone waiting for caps on this inode */
>         wake_up_all(&ci->i_cap_wq);
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index a62d2a9..c1732ff 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -677,6 +677,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>  struct kmem_cache *ceph_cap_flush_cachep;
>  struct kmem_cache *ceph_dentry_cachep;
>  struct kmem_cache *ceph_file_cachep;
> +struct kmem_cache *ceph_readdir_cachep;
>
>  static void ceph_inode_init_once(void *foo)
>  {
> @@ -711,14 +712,22 @@ static int __init init_caches(void)
>                 goto bad_dentry;
>
>         ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
> -
>         if (!ceph_file_cachep)
>                 goto bad_file;
>
> +       ceph_readdir_cachep = KMEM_CACHE(ceph_readdir_info, SLAB_MEM_SPREAD);
> +       if (!ceph_readdir_cachep)
> +               goto bad_readdir;
> +
>         if ((error = ceph_fscache_register()))
> -               goto bad_file;
> +               goto bad_fscache;
>
>         return 0;
> +
> +bad_fscache:
> +       kmem_cache_destroy(ceph_readdir_cachep);
> +bad_readdir:
> +       kmem_cache_destroy(ceph_file_cachep);
>  bad_file:
>         kmem_cache_destroy(ceph_dentry_cachep);
>  bad_dentry:
> @@ -743,6 +752,7 @@ static void destroy_caches(void)
>         kmem_cache_destroy(ceph_cap_flush_cachep);
>         kmem_cache_destroy(ceph_dentry_cachep);
>         kmem_cache_destroy(ceph_file_cachep);
> +       kmem_cache_destroy(ceph_readdir_cachep);
>
>         ceph_fscache_unregister();
>  }
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 21b2e5b..3a943a0 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -665,13 +665,7 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
>  #define CEPH_F_SYNC     1
>  #define CEPH_F_ATEND    2
>
> -struct ceph_file_info {
> -       short fmode;     /* initialized on open */
> -       short flags;     /* CEPH_F_* */
> -
> -       spinlock_t rw_contexts_lock;
> -       struct list_head rw_contexts;
> -
> +struct ceph_readdir_info {
>         /* readdir: position within the dir */
>         u32 frag;
>         struct ceph_mds_request *last_readdir;
> @@ -688,6 +682,17 @@ struct ceph_file_info {
>         int dir_info_len;
>  };
>
> +struct ceph_file_info {
> +       short fmode;     /* initialized on open */
> +       short flags;     /* CEPH_F_* */
> +
> +       spinlock_t rw_contexts_lock;
> +       struct list_head rw_contexts;
> +
> +       /* readdir related things */
> +       struct ceph_readdir_info *readdir_info;
> +};


For dir file. I prefer something like:

struct ceph_dir_file_info {
   struct ceph_file_info file;
   ceph_readdir_info readdir.
};

This avoids double memory allocation for dir file.

Regards
Yan, Zheng

> +
>  struct ceph_rw_context {
>         struct list_head list;
>         struct task_struct *thread;
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index c2ec44c..4ca23fa 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -262,6 +262,7 @@ static inline int calc_pages_for(u64 off, u64 len)
>  extern struct kmem_cache *ceph_cap_flush_cachep;
>  extern struct kmem_cache *ceph_dentry_cachep;
>  extern struct kmem_cache *ceph_file_cachep;
> +extern struct kmem_cache *ceph_readdir_cachep;
>
>  /* ceph_common.c */
>  extern bool libceph_compatible(void *data);
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Chengguang Xu March 6, 2018, 10:08 a.m. UTC | #2
> Sent: Tuesday, March 06, 2018 at 4:34 PM
> From: "Yan, Zheng" <ukernel@gmail.com>
> To: "Chengguang Xu" <cgxu519@gmx.com>
> Cc: "Zheng Yan" <zyan@redhat.com>, "Ilya Dryomov" <idryomov@gmail.com>, ceph-devel <ceph-devel@vger.kernel.org>
> Subject: Re: [PATCH] ceph: optimize memory usage
>
> On Mon, Mar 5, 2018 at 8:17 PM, Chengguang Xu <cgxu519@gmx.com> wrote:
> > In current code, regular file and directory use same struct
> > ceph_file_info to store fs specific data so the struct has to
> > include some fields which are only used for directory
> > (e.g., readdir related info), when having plenty of regular files,
> > it will lead to memory waste.
> >
> > This patch introduces dedicated readdir_info cache to separate readdir
> > related things from struct ceph_file_info, so that regular file could
> > not include those unused fields anymore. Also, chagned to manipulate
> > fscache only when reuglar file successfully acquired ceph_file_info.
> >
> > Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
> > ---
> >  fs/ceph/dir.c                | 158 +++++++++++++++++++++++--------------------
> >  fs/ceph/file.c               |  88 ++++++++++++++++--------
> >  fs/ceph/super.c              |  14 +++-
> >  fs/ceph/super.h              |  19 ++++--
> >  include/linux/ceph/libceph.h |   1 +
> >  5 files changed, 170 insertions(+), 110 deletions(-)
> >
> > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > index 0c43468..84bc75a 100644
> > --- a/fs/ceph/dir.c
> > +++ b/fs/ceph/dir.c
> > @@ -105,15 +105,17 @@ static int fpos_cmp(loff_t l, loff_t r)
> >  static int note_last_dentry(struct ceph_file_info *fi, const char *name,
> >                             int len, unsigned next_offset)
> >  {
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> >         char *buf = kmalloc(len+1, GFP_KERNEL);
> >         if (!buf)
> >                 return -ENOMEM;
> > -       kfree(fi->last_name);
> > -       fi->last_name = buf;
> > -       memcpy(fi->last_name, name, len);
> > -       fi->last_name[len] = 0;
> > -       fi->next_offset = next_offset;
> > -       dout("note_last_dentry '%s'\n", fi->last_name);
> > +
> > +       kfree(ri->last_name);
> > +       ri->last_name = buf;
> > +       memcpy(ri->last_name, name, len);
> > +       ri->last_name[len] = 0;
> > +       ri->next_offset = next_offset;
> > +       dout("note_last_dentry '%s'\n", ri->last_name);
> >         return 0;
> >  }
> >
> > @@ -176,6 +178,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
> >                             int shared_gen)
> >  {
> >         struct ceph_file_info *fi = file->private_data;
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> >         struct dentry *parent = file->f_path.dentry;
> >         struct inode *dir = d_inode(parent);
> >         struct dentry *dentry, *last = NULL;
> > @@ -279,9 +282,9 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
> >                         err = ret;
> >                 dput(last);
> >                 /* last_name no longer match cache index */
> > -               if (fi->readdir_cache_idx >= 0) {
> > -                       fi->readdir_cache_idx = -1;
> > -                       fi->dir_release_count = 0;
> > +               if (ri->readdir_cache_idx >= 0) {
> > +                       ri->readdir_cache_idx = -1;
> > +                       ri->dir_release_count = 0;
> >                 }
> >         }
> >         return err;
> > @@ -289,17 +292,20 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
> >
> >  static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
> >  {
> > -       if (!fi->last_readdir)
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> > +
> > +       if (!ri->last_readdir)
> >                 return true;
> >         if (is_hash_order(pos))
> > -               return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
> > +               return !ceph_frag_contains_value(ri->frag, fpos_hash(pos));
> >         else
> > -               return fi->frag != fpos_frag(pos);
> > +               return ri->frag != fpos_frag(pos);
> >  }
> >
> >  static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >  {
> >         struct ceph_file_info *fi = file->private_data;
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> >         struct inode *inode = file_inode(file);
> >         struct ceph_inode_info *ci = ceph_inode(inode);
> >         struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > @@ -357,9 +363,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                         CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
> >
> >                 /* discard old result, if any */
> > -               if (fi->last_readdir) {
> > -                       ceph_mdsc_put_request(fi->last_readdir);
> > -                       fi->last_readdir = NULL;
> > +               if (ri->last_readdir) {
> > +                       ceph_mdsc_put_request(ri->last_readdir);
> > +                       ri->last_readdir = NULL;
> >                 }
> >
> >                 if (is_hash_order(ctx->pos)) {
> > @@ -373,7 +379,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                 }
> >
> >                 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
> > -                    ceph_vinop(inode), frag, fi->last_name);
> > +                    ceph_vinop(inode), frag, ri->last_name);
> >                 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
> >                 if (IS_ERR(req))
> >                         return PTR_ERR(req);
> > @@ -389,8 +395,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                         __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
> >                         req->r_inode_drop = CEPH_CAP_FILE_EXCL;
> >                 }
> > -               if (fi->last_name) {
> > -                       req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
> > +               if (ri->last_name) {
> > +                       req->r_path2 = kstrdup(ri->last_name, GFP_KERNEL);
> >                         if (!req->r_path2) {
> >                                 ceph_mdsc_put_request(req);
> >                                 return -ENOMEM;
> > @@ -400,10 +406,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                                 cpu_to_le32(fpos_hash(ctx->pos));
> >                 }
> >
> > -               req->r_dir_release_cnt = fi->dir_release_count;
> > -               req->r_dir_ordered_cnt = fi->dir_ordered_count;
> > -               req->r_readdir_cache_idx = fi->readdir_cache_idx;
> > -               req->r_readdir_offset = fi->next_offset;
> > +               req->r_dir_release_cnt = ri->dir_release_count;
> > +               req->r_dir_ordered_cnt = ri->dir_ordered_count;
> > +               req->r_readdir_cache_idx = ri->readdir_cache_idx;
> > +               req->r_readdir_offset = ri->next_offset;
> >                 req->r_args.readdir.frag = cpu_to_le32(frag);
> >                 req->r_args.readdir.flags =
> >                                 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
> > @@ -427,35 +433,35 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
> >                         frag = le32_to_cpu(rinfo->dir_dir->frag);
> >                         if (!rinfo->hash_order) {
> > -                               fi->next_offset = req->r_readdir_offset;
> > +                               ri->next_offset = req->r_readdir_offset;
> >                                 /* adjust ctx->pos to beginning of frag */
> >                                 ctx->pos = ceph_make_fpos(frag,
> > -                                                         fi->next_offset,
> > +                                                         ri->next_offset,
> >                                                           false);
> >                         }
> >                 }
> >
> > -               fi->frag = frag;
> > -               fi->last_readdir = req;
> > +               ri->frag = frag;
> > +               ri->last_readdir = req;
> >
> >                 if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
> > -                       fi->readdir_cache_idx = req->r_readdir_cache_idx;
> > -                       if (fi->readdir_cache_idx < 0) {
> > +                       ri->readdir_cache_idx = req->r_readdir_cache_idx;
> > +                       if (ri->readdir_cache_idx < 0) {
> >                                 /* preclude from marking dir ordered */
> > -                               fi->dir_ordered_count = 0;
> > +                               ri->dir_ordered_count = 0;
> >                         } else if (ceph_frag_is_leftmost(frag) &&
> > -                                  fi->next_offset == 2) {
> > +                                  ri->next_offset == 2) {
> >                                 /* note dir version at start of readdir so
> >                                  * we can tell if any dentries get dropped */
> > -                               fi->dir_release_count = req->r_dir_release_cnt;
> > -                               fi->dir_ordered_count = req->r_dir_ordered_cnt;
> > +                               ri->dir_release_count = req->r_dir_release_cnt;
> > +                               ri->dir_ordered_count = req->r_dir_ordered_cnt;
> >                         }
> >                 } else {
> >                         dout("readdir !did_prepopulate");
> >                         /* disable readdir cache */
> > -                       fi->readdir_cache_idx = -1;
> > +                       ri->readdir_cache_idx = -1;
> >                         /* preclude from marking dir complete */
> > -                       fi->dir_release_count = 0;
> > +                       ri->dir_release_count = 0;
> >                 }
> >
> >                 /* note next offset and last dentry name */
> > @@ -469,14 +475,14 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                         if (err)
> >                                 return err;
> >                 } else if (req->r_reply_info.dir_end) {
> > -                       fi->next_offset = 2;
> > +                       ri->next_offset = 2;
> >                         /* keep last name */
> >                 }
> >         }
> >
> > -       rinfo = &fi->last_readdir->r_reply_info;
> > +       rinfo = &ri->last_readdir->r_reply_info;
> >         dout("readdir frag %x num %d pos %llx chunk first %llx\n",
> > -            fi->frag, rinfo->dir_nr, ctx->pos,
> > +            ri->frag, rinfo->dir_nr, ctx->pos,
> >              rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
> >
> >         i = 0;
> > @@ -520,27 +526,27 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                 ctx->pos++;
> >         }
> >
> > -       ceph_mdsc_put_request(fi->last_readdir);
> > -       fi->last_readdir = NULL;
> > +       ceph_mdsc_put_request(ri->last_readdir);
> > +       ri->last_readdir = NULL;
> >
> > -       if (fi->next_offset > 2) {
> > -               frag = fi->frag;
> > +       if (ri->next_offset > 2) {
> > +               frag = ri->frag;
> >                 goto more;
> >         }
> >
> >         /* more frags? */
> > -       if (!ceph_frag_is_rightmost(fi->frag)) {
> > -               frag = ceph_frag_next(fi->frag);
> > +       if (!ceph_frag_is_rightmost(ri->frag)) {
> > +               frag = ceph_frag_next(ri->frag);
> >                 if (is_hash_order(ctx->pos)) {
> >                         loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
> > -                                                       fi->next_offset, true);
> > +                                                       ri->next_offset, true);
> >                         if (new_pos > ctx->pos)
> >                                 ctx->pos = new_pos;
> >                         /* keep last_name */
> >                 } else {
> > -                       ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
> > -                       kfree(fi->last_name);
> > -                       fi->last_name = NULL;
> > +                       ctx->pos = ceph_make_fpos(frag, ri->next_offset, false);
> > +                       kfree(ri->last_name);
> > +                       ri->last_name = NULL;
> >                 }
> >                 dout("readdir next frag is %x\n", frag);
> >                 goto more;
> > @@ -552,20 +558,20 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >          * were released during the whole readdir, and we should have
> >          * the complete dir contents in our cache.
> >          */
> > -       if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
> > +       if (atomic64_read(&ci->i_release_count) == ri->dir_release_count) {
> >                 spin_lock(&ci->i_ceph_lock);
> > -               if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
> > +               if (ri->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
> >                         dout(" marking %p complete and ordered\n", inode);
> >                         /* use i_size to track number of entries in
> >                          * readdir cache */
> > -                       BUG_ON(fi->readdir_cache_idx < 0);
> > -                       i_size_write(inode, fi->readdir_cache_idx *
> > +                       BUG_ON(ri->readdir_cache_idx < 0);
> > +                       i_size_write(inode, ri->readdir_cache_idx *
> >                                      sizeof(struct dentry*));
> >                 } else {
> >                         dout(" marking %p complete\n", inode);
> >                 }
> > -               __ceph_dir_set_complete(ci, fi->dir_release_count,
> > -                                       fi->dir_ordered_count);
> > +               __ceph_dir_set_complete(ci, ri->dir_release_count,
> > +                                       ri->dir_ordered_count);
> >                 spin_unlock(&ci->i_ceph_lock);
> >         }
> >
> > @@ -575,15 +581,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >
> >  static void reset_readdir(struct ceph_file_info *fi)
> >  {
> > -       if (fi->last_readdir) {
> > -               ceph_mdsc_put_request(fi->last_readdir);
> > -               fi->last_readdir = NULL;
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> > +
> > +       if (ri->last_readdir) {
> > +               ceph_mdsc_put_request(ri->last_readdir);
> > +               ri->last_readdir = NULL;
> >         }
> > -       kfree(fi->last_name);
> > -       fi->last_name = NULL;
> > -       fi->dir_release_count = 0;
> > -       fi->readdir_cache_idx = -1;
> > -       fi->next_offset = 2;  /* compensate for . and .. */
> > +       kfree(ri->last_name);
> > +       ri->last_name = NULL;
> > +       ri->dir_release_count = 0;
> > +       ri->readdir_cache_idx = -1;
> > +       ri->next_offset = 2;  /* compensate for . and .. */
> >         fi->flags &= ~CEPH_F_ATEND;
> >  }
> >
> > @@ -594,16 +602,18 @@ static void reset_readdir(struct ceph_file_info *fi)
> >  static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
> >  {
> >         struct ceph_mds_reply_info_parsed *rinfo;
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> >         loff_t chunk_offset;
> > +
> >         if (new_pos == 0)
> >                 return true;
> >         if (is_hash_order(new_pos)) {
> >                 /* no need to reset last_name for a forward seek when
> >                  * dentries are sotred in hash order */
> > -       } else if (fi->frag != fpos_frag(new_pos)) {
> > +       } else if (ri->frag != fpos_frag(new_pos)) {
> >                 return true;
> >         }
> > -       rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
> > +       rinfo = ri->last_readdir ? &ri->last_readdir->r_reply_info : NULL;
> >         if (!rinfo || !rinfo->dir_nr)
> >                 return true;
> >         chunk_offset = rinfo->dir_entries[0].offset;
> > @@ -614,6 +624,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
> >  static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
> >  {
> >         struct ceph_file_info *fi = file->private_data;
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> >         struct inode *inode = file->f_mapping->host;
> >         loff_t retval;
> >
> > @@ -637,8 +648,8 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
> >                 } else if (is_hash_order(offset) && offset > file->f_pos) {
> >                         /* for hash offset, we don't know if a forward seek
> >                          * is within same frag */
> > -                       fi->dir_release_count = 0;
> > -                       fi->readdir_cache_idx = -1;
> > +                       ri->dir_release_count = 0;
> > +                       ri->readdir_cache_idx = -1;
> >                 }
> >
> >                 if (offset != file->f_pos) {
> > @@ -1371,6 +1382,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
> >                              loff_t *ppos)
> >  {
> >         struct ceph_file_info *cf = file->private_data;
> > +       struct ceph_readdir_info *ri = cf->readdir_info;
> >         struct inode *inode = file_inode(file);
> >         struct ceph_inode_info *ci = ceph_inode(inode);
> >         int left;
> > @@ -1379,12 +1391,12 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
> >         if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
> >                 return -EISDIR;
> >
> > -       if (!cf->dir_info) {
> > -               cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
> > -               if (!cf->dir_info)
> > +       if (!ri->dir_info) {
> > +               ri->dir_info = kmalloc(bufsize, GFP_KERNEL);
> > +               if (!ri->dir_info)
> >                         return -ENOMEM;
> > -               cf->dir_info_len =
> > -                       snprintf(cf->dir_info, bufsize,
> > +               ri->dir_info_len =
> > +                       snprintf(ri->dir_info, bufsize,
> >                                 "entries:   %20lld\n"
> >                                 " files:    %20lld\n"
> >                                 " subdirs:  %20lld\n"
> > @@ -1404,10 +1416,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
> >                                 (long)ci->i_rctime.tv_nsec);
> >         }
> >
> > -       if (*ppos >= cf->dir_info_len)
> > +       if (*ppos >= ri->dir_info_len)
> >                 return 0;
> > -       size = min_t(unsigned, size, cf->dir_info_len-*ppos);
> > -       left = copy_to_user(buf, cf->dir_info + *ppos, size);
> > +       size = min_t(unsigned int, size, ri->dir_info_len-*ppos);
> > +       left = copy_to_user(buf, ri->dir_info + *ppos, size);
> >         if (left == size)
> >                 return -EFAULT;
> >         *ppos += (size - left);
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 6639926..611aaef 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -159,36 +159,60 @@ static size_t dio_get_pagev_size(const struct iov_iter *it)
> >         return req;
> >  }
> >
> > +
> > +static int ceph_init_file_info(struct inode *inode,
> > +                       struct file *file, int fmode)
> > +{
> > +       struct ceph_file_info *fi;
> > +
> > +       dout("init_file inode %p file %p mode 0%o (%s)\n", inode, file,
> > +               inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir");
> > +
> > +       BUG_ON(inode->i_fop->release != ceph_release);
> > +
> > +       fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> > +       if (!fi) {
> > +               ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > +               return -ENOMEM;
> > +       }
> > +
> > +       if (S_ISDIR(inode->i_mode)) {
> > +               fi->readdir_info = kmem_cache_zalloc(ceph_readdir_cachep,
> > +                                                       GFP_KERNEL);
> > +               if (!fi->readdir_info) {
> > +                       kmem_cache_free(ceph_file_cachep, fi);
> > +                       return -ENOMEM;
> > +               }
> > +
> > +               fi->readdir_info->next_offset = 2;
> > +               fi->readdir_info->readdir_cache_idx = -1;
> > +       } else {
> > +               ceph_fscache_register_inode_cookie(inode);
> > +               ceph_fscache_file_set_cookie(inode, file);
> > +       }
> > +
> > +       fi->fmode = fmode;
> > +       spin_lock_init(&fi->rw_contexts_lock);
> > +       INIT_LIST_HEAD(&fi->rw_contexts);
> > +       file->private_data = fi;
> > +
> > +       return 0;
> > +}
> > +
> >  /*
> >   * initialize private struct file data.
> >   * if we fail, clean up by dropping fmode reference on the ceph_inode
> >   */
> >  static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
> >  {
> > -       struct ceph_file_info *cf;
> >         int ret = 0;
> >
> >         switch (inode->i_mode & S_IFMT) {
> >         case S_IFREG:
> > -               ceph_fscache_register_inode_cookie(inode);
> > -               ceph_fscache_file_set_cookie(inode, file);
> >         case S_IFDIR:
> > -               dout("init_file %p %p 0%o (regular)\n", inode, file,
> > -                    inode->i_mode);
> > -               cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
> > -               if (!cf) {
> > -                       ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
> > -                       return -ENOMEM;
> > -               }
> > -               cf->fmode = fmode;
> > -
> > -               spin_lock_init(&cf->rw_contexts_lock);
> > -               INIT_LIST_HEAD(&cf->rw_contexts);
> > -
> > -               cf->next_offset = 2;
> > -               cf->readdir_cache_idx = -1;
> > -               file->private_data = cf;
> > -               BUG_ON(inode->i_fop->release != ceph_release);
> > +               ret = ceph_init_file_info(inode, file, fmode);
> > +               if (ret)
> > +                       return ret;
> >                 break;
> >
> >         case S_IFLNK:
> > @@ -460,16 +484,24 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> >  int ceph_release(struct inode *inode, struct file *file)
> >  {
> >         struct ceph_inode_info *ci = ceph_inode(inode);
> > -       struct ceph_file_info *cf = file->private_data;
> > +       struct ceph_file_info *fi = file->private_data;
> > +       struct ceph_readdir_info *ri = fi->readdir_info;
> > +
> > +       dout("release inode %p file %p mode 0%o (%s)\n", inode, file,
> > +               inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir");
> >
> > -       dout("release inode %p file %p\n", inode, file);
> > -       ceph_put_fmode(ci, cf->fmode);
> > -       if (cf->last_readdir)
> > -               ceph_mdsc_put_request(cf->last_readdir);
> > -       kfree(cf->last_name);
> > -       kfree(cf->dir_info);
> > -       WARN_ON(!list_empty(&cf->rw_contexts));
> > -       kmem_cache_free(ceph_file_cachep, cf);
> > +       WARN_ON(!list_empty(&fi->rw_contexts));
> > +
> > +       ceph_put_fmode(ci, fi->fmode);
> > +       if (ri) {
> > +               if (ri->last_readdir)
> > +                       ceph_mdsc_put_request(ri->last_readdir);
> > +               kfree(ri->last_name);
> > +               kfree(ri->dir_info);
> > +               kmem_cache_free(ceph_readdir_cachep, ri);
> > +               ri = NULL;
> > +       }
> > +       kmem_cache_free(ceph_file_cachep, fi);
> >
> >         /* wake up anyone waiting for caps on this inode */
> >         wake_up_all(&ci->i_cap_wq);
> > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > index a62d2a9..c1732ff 100644
> > --- a/fs/ceph/super.c
> > +++ b/fs/ceph/super.c
> > @@ -677,6 +677,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
> >  struct kmem_cache *ceph_cap_flush_cachep;
> >  struct kmem_cache *ceph_dentry_cachep;
> >  struct kmem_cache *ceph_file_cachep;
> > +struct kmem_cache *ceph_readdir_cachep;
> >
> >  static void ceph_inode_init_once(void *foo)
> >  {
> > @@ -711,14 +712,22 @@ static int __init init_caches(void)
> >                 goto bad_dentry;
> >
> >         ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
> > -
> >         if (!ceph_file_cachep)
> >                 goto bad_file;
> >
> > +       ceph_readdir_cachep = KMEM_CACHE(ceph_readdir_info, SLAB_MEM_SPREAD);
> > +       if (!ceph_readdir_cachep)
> > +               goto bad_readdir;
> > +
> >         if ((error = ceph_fscache_register()))
> > -               goto bad_file;
> > +               goto bad_fscache;
> >
> >         return 0;
> > +
> > +bad_fscache:
> > +       kmem_cache_destroy(ceph_readdir_cachep);
> > +bad_readdir:
> > +       kmem_cache_destroy(ceph_file_cachep);
> >  bad_file:
> >         kmem_cache_destroy(ceph_dentry_cachep);
> >  bad_dentry:
> > @@ -743,6 +752,7 @@ static void destroy_caches(void)
> >         kmem_cache_destroy(ceph_cap_flush_cachep);
> >         kmem_cache_destroy(ceph_dentry_cachep);
> >         kmem_cache_destroy(ceph_file_cachep);
> > +       kmem_cache_destroy(ceph_readdir_cachep);
> >
> >         ceph_fscache_unregister();
> >  }
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 21b2e5b..3a943a0 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -665,13 +665,7 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
> >  #define CEPH_F_SYNC     1
> >  #define CEPH_F_ATEND    2
> >
> > -struct ceph_file_info {
> > -       short fmode;     /* initialized on open */
> > -       short flags;     /* CEPH_F_* */
> > -
> > -       spinlock_t rw_contexts_lock;
> > -       struct list_head rw_contexts;
> > -
> > +struct ceph_readdir_info {
> >         /* readdir: position within the dir */
> >         u32 frag;
> >         struct ceph_mds_request *last_readdir;
> > @@ -688,6 +682,17 @@ struct ceph_file_info {
> >         int dir_info_len;
> >  };
> >
> > +struct ceph_file_info {
> > +       short fmode;     /* initialized on open */
> > +       short flags;     /* CEPH_F_* */
> > +
> > +       spinlock_t rw_contexts_lock;
> > +       struct list_head rw_contexts;
> > +
> > +       /* readdir related things */
> > +       struct ceph_readdir_info *readdir_info;
> > +};
> 
> 
> For dir file. I prefer something like:
> 
> struct ceph_dir_file_info {
>    struct ceph_file_info file;
>    ceph_readdir_info readdir.
> };
> 
> This avoids double memory allocation for dir file.

That makes sense. The only reason I keep ceph_file_info as
common interface of regular file and dir is for voiding 
detection of file type in many calling places. But if you
think it's worth, I can modify like that.

Thanks,
Chengguang.

> > +
> >  struct ceph_rw_context {
> >         struct list_head list;
> >         struct task_struct *thread;
> > diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> > index c2ec44c..4ca23fa 100644
> > --- a/include/linux/ceph/libceph.h
> > +++ b/include/linux/ceph/libceph.h
> > @@ -262,6 +262,7 @@ static inline int calc_pages_for(u64 off, u64 len)
> >  extern struct kmem_cache *ceph_cap_flush_cachep;
> >  extern struct kmem_cache *ceph_dentry_cachep;
> >  extern struct kmem_cache *ceph_file_cachep;
> > +extern struct kmem_cache *ceph_readdir_cachep;
> >
> >  /* ceph_common.c */
> >  extern bool libceph_compatible(void *data);
> > --
> > 1.8.3.1
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0c43468..84bc75a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -105,15 +105,17 @@  static int fpos_cmp(loff_t l, loff_t r)
 static int note_last_dentry(struct ceph_file_info *fi, const char *name,
 		            int len, unsigned next_offset)
 {
+	struct ceph_readdir_info *ri = fi->readdir_info;
 	char *buf = kmalloc(len+1, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
-	kfree(fi->last_name);
-	fi->last_name = buf;
-	memcpy(fi->last_name, name, len);
-	fi->last_name[len] = 0;
-	fi->next_offset = next_offset;
-	dout("note_last_dentry '%s'\n", fi->last_name);
+
+	kfree(ri->last_name);
+	ri->last_name = buf;
+	memcpy(ri->last_name, name, len);
+	ri->last_name[len] = 0;
+	ri->next_offset = next_offset;
+	dout("note_last_dentry '%s'\n", ri->last_name);
 	return 0;
 }
 
@@ -176,6 +178,7 @@  static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 			    int shared_gen)
 {
 	struct ceph_file_info *fi = file->private_data;
+	struct ceph_readdir_info *ri = fi->readdir_info;
 	struct dentry *parent = file->f_path.dentry;
 	struct inode *dir = d_inode(parent);
 	struct dentry *dentry, *last = NULL;
@@ -279,9 +282,9 @@  static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 			err = ret;
 		dput(last);
 		/* last_name no longer match cache index */
-		if (fi->readdir_cache_idx >= 0) {
-			fi->readdir_cache_idx = -1;
-			fi->dir_release_count = 0;
+		if (ri->readdir_cache_idx >= 0) {
+			ri->readdir_cache_idx = -1;
+			ri->dir_release_count = 0;
 		}
 	}
 	return err;
@@ -289,17 +292,20 @@  static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 
 static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
 {
-	if (!fi->last_readdir)
+	struct ceph_readdir_info *ri = fi->readdir_info;
+
+	if (!ri->last_readdir)
 		return true;
 	if (is_hash_order(pos))
-		return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
+		return !ceph_frag_contains_value(ri->frag, fpos_hash(pos));
 	else
-		return fi->frag != fpos_frag(pos);
+		return ri->frag != fpos_frag(pos);
 }
 
 static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct ceph_file_info *fi = file->private_data;
+	struct ceph_readdir_info *ri = fi->readdir_info;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -357,9 +363,9 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
 		/* discard old result, if any */
-		if (fi->last_readdir) {
-			ceph_mdsc_put_request(fi->last_readdir);
-			fi->last_readdir = NULL;
+		if (ri->last_readdir) {
+			ceph_mdsc_put_request(ri->last_readdir);
+			ri->last_readdir = NULL;
 		}
 
 		if (is_hash_order(ctx->pos)) {
@@ -373,7 +379,7 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		}
 
 		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
-		     ceph_vinop(inode), frag, fi->last_name);
+		     ceph_vinop(inode), frag, ri->last_name);
 		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
 		if (IS_ERR(req))
 			return PTR_ERR(req);
@@ -389,8 +395,8 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 			__set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
 			req->r_inode_drop = CEPH_CAP_FILE_EXCL;
 		}
-		if (fi->last_name) {
-			req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
+		if (ri->last_name) {
+			req->r_path2 = kstrdup(ri->last_name, GFP_KERNEL);
 			if (!req->r_path2) {
 				ceph_mdsc_put_request(req);
 				return -ENOMEM;
@@ -400,10 +406,10 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 				cpu_to_le32(fpos_hash(ctx->pos));
 		}
 
-		req->r_dir_release_cnt = fi->dir_release_count;
-		req->r_dir_ordered_cnt = fi->dir_ordered_count;
-		req->r_readdir_cache_idx = fi->readdir_cache_idx;
-		req->r_readdir_offset = fi->next_offset;
+		req->r_dir_release_cnt = ri->dir_release_count;
+		req->r_dir_ordered_cnt = ri->dir_ordered_count;
+		req->r_readdir_cache_idx = ri->readdir_cache_idx;
+		req->r_readdir_offset = ri->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.flags =
 				cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
@@ -427,35 +433,35 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
 			frag = le32_to_cpu(rinfo->dir_dir->frag);
 			if (!rinfo->hash_order) {
-				fi->next_offset = req->r_readdir_offset;
+				ri->next_offset = req->r_readdir_offset;
 				/* adjust ctx->pos to beginning of frag */
 				ctx->pos = ceph_make_fpos(frag,
-							  fi->next_offset,
+							  ri->next_offset,
 							  false);
 			}
 		}
 
-		fi->frag = frag;
-		fi->last_readdir = req;
+		ri->frag = frag;
+		ri->last_readdir = req;
 
 		if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
-			fi->readdir_cache_idx = req->r_readdir_cache_idx;
-			if (fi->readdir_cache_idx < 0) {
+			ri->readdir_cache_idx = req->r_readdir_cache_idx;
+			if (ri->readdir_cache_idx < 0) {
 				/* preclude from marking dir ordered */
-				fi->dir_ordered_count = 0;
+				ri->dir_ordered_count = 0;
 			} else if (ceph_frag_is_leftmost(frag) &&
-				   fi->next_offset == 2) {
+				   ri->next_offset == 2) {
 				/* note dir version at start of readdir so
 				 * we can tell if any dentries get dropped */
-				fi->dir_release_count = req->r_dir_release_cnt;
-				fi->dir_ordered_count = req->r_dir_ordered_cnt;
+				ri->dir_release_count = req->r_dir_release_cnt;
+				ri->dir_ordered_count = req->r_dir_ordered_cnt;
 			}
 		} else {
 			dout("readdir !did_prepopulate");
 			/* disable readdir cache */
-			fi->readdir_cache_idx = -1;
+			ri->readdir_cache_idx = -1;
 			/* preclude from marking dir complete */
-			fi->dir_release_count = 0;
+			ri->dir_release_count = 0;
 		}
 
 		/* note next offset and last dentry name */
@@ -469,14 +475,14 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 			if (err)
 				return err;
 		} else if (req->r_reply_info.dir_end) {
-			fi->next_offset = 2;
+			ri->next_offset = 2;
 			/* keep last name */
 		}
 	}
 
-	rinfo = &fi->last_readdir->r_reply_info;
+	rinfo = &ri->last_readdir->r_reply_info;
 	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
-	     fi->frag, rinfo->dir_nr, ctx->pos,
+	     ri->frag, rinfo->dir_nr, ctx->pos,
 	     rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
 
 	i = 0;
@@ -520,27 +526,27 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		ctx->pos++;
 	}
 
-	ceph_mdsc_put_request(fi->last_readdir);
-	fi->last_readdir = NULL;
+	ceph_mdsc_put_request(ri->last_readdir);
+	ri->last_readdir = NULL;
 
-	if (fi->next_offset > 2) {
-		frag = fi->frag;
+	if (ri->next_offset > 2) {
+		frag = ri->frag;
 		goto more;
 	}
 
 	/* more frags? */
-	if (!ceph_frag_is_rightmost(fi->frag)) {
-		frag = ceph_frag_next(fi->frag);
+	if (!ceph_frag_is_rightmost(ri->frag)) {
+		frag = ceph_frag_next(ri->frag);
 		if (is_hash_order(ctx->pos)) {
 			loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
-							fi->next_offset, true);
+							ri->next_offset, true);
 			if (new_pos > ctx->pos)
 				ctx->pos = new_pos;
 			/* keep last_name */
 		} else {
-			ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
-			kfree(fi->last_name);
-			fi->last_name = NULL;
+			ctx->pos = ceph_make_fpos(frag, ri->next_offset, false);
+			kfree(ri->last_name);
+			ri->last_name = NULL;
 		}
 		dout("readdir next frag is %x\n", frag);
 		goto more;
@@ -552,20 +558,20 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	 * were released during the whole readdir, and we should have
 	 * the complete dir contents in our cache.
 	 */
-	if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
+	if (atomic64_read(&ci->i_release_count) == ri->dir_release_count) {
 		spin_lock(&ci->i_ceph_lock);
-		if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
+		if (ri->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
 			dout(" marking %p complete and ordered\n", inode);
 			/* use i_size to track number of entries in
 			 * readdir cache */
-			BUG_ON(fi->readdir_cache_idx < 0);
-			i_size_write(inode, fi->readdir_cache_idx *
+			BUG_ON(ri->readdir_cache_idx < 0);
+			i_size_write(inode, ri->readdir_cache_idx *
 				     sizeof(struct dentry*));
 		} else {
 			dout(" marking %p complete\n", inode);
 		}
-		__ceph_dir_set_complete(ci, fi->dir_release_count,
-					fi->dir_ordered_count);
+		__ceph_dir_set_complete(ci, ri->dir_release_count,
+					ri->dir_ordered_count);
 		spin_unlock(&ci->i_ceph_lock);
 	}
 
@@ -575,15 +581,17 @@  static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
 static void reset_readdir(struct ceph_file_info *fi)
 {
-	if (fi->last_readdir) {
-		ceph_mdsc_put_request(fi->last_readdir);
-		fi->last_readdir = NULL;
+	struct ceph_readdir_info *ri = fi->readdir_info;
+
+	if (ri->last_readdir) {
+		ceph_mdsc_put_request(ri->last_readdir);
+		ri->last_readdir = NULL;
 	}
-	kfree(fi->last_name);
-	fi->last_name = NULL;
-	fi->dir_release_count = 0;
-	fi->readdir_cache_idx = -1;
-	fi->next_offset = 2;  /* compensate for . and .. */
+	kfree(ri->last_name);
+	ri->last_name = NULL;
+	ri->dir_release_count = 0;
+	ri->readdir_cache_idx = -1;
+	ri->next_offset = 2;  /* compensate for . and .. */
 	fi->flags &= ~CEPH_F_ATEND;
 }
 
@@ -594,16 +602,18 @@  static void reset_readdir(struct ceph_file_info *fi)
 static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
 {
 	struct ceph_mds_reply_info_parsed *rinfo;
+	struct ceph_readdir_info *ri = fi->readdir_info;
 	loff_t chunk_offset;
+
 	if (new_pos == 0)
 		return true;
 	if (is_hash_order(new_pos)) {
 		/* no need to reset last_name for a forward seek when
 		 * dentries are sotred in hash order */
-	} else if (fi->frag != fpos_frag(new_pos)) {
+	} else if (ri->frag != fpos_frag(new_pos)) {
 		return true;
 	}
-	rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
+	rinfo = ri->last_readdir ? &ri->last_readdir->r_reply_info : NULL;
 	if (!rinfo || !rinfo->dir_nr)
 		return true;
 	chunk_offset = rinfo->dir_entries[0].offset;
@@ -614,6 +624,7 @@  static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct ceph_file_info *fi = file->private_data;
+	struct ceph_readdir_info *ri = fi->readdir_info;
 	struct inode *inode = file->f_mapping->host;
 	loff_t retval;
 
@@ -637,8 +648,8 @@  static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 		} else if (is_hash_order(offset) && offset > file->f_pos) {
 			/* for hash offset, we don't know if a forward seek
 			 * is within same frag */
-			fi->dir_release_count = 0;
-			fi->readdir_cache_idx = -1;
+			ri->dir_release_count = 0;
+			ri->readdir_cache_idx = -1;
 		}
 
 		if (offset != file->f_pos) {
@@ -1371,6 +1382,7 @@  static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 			     loff_t *ppos)
 {
 	struct ceph_file_info *cf = file->private_data;
+	struct ceph_readdir_info *ri = cf->readdir_info;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
@@ -1379,12 +1391,12 @@  static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
-	if (!cf->dir_info) {
-		cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
-		if (!cf->dir_info)
+	if (!ri->dir_info) {
+		ri->dir_info = kmalloc(bufsize, GFP_KERNEL);
+		if (!ri->dir_info)
 			return -ENOMEM;
-		cf->dir_info_len =
-			snprintf(cf->dir_info, bufsize,
+		ri->dir_info_len =
+			snprintf(ri->dir_info, bufsize,
 				"entries:   %20lld\n"
 				" files:    %20lld\n"
 				" subdirs:  %20lld\n"
@@ -1404,10 +1416,10 @@  static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 				(long)ci->i_rctime.tv_nsec);
 	}
 
-	if (*ppos >= cf->dir_info_len)
+	if (*ppos >= ri->dir_info_len)
 		return 0;
-	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
-	left = copy_to_user(buf, cf->dir_info + *ppos, size);
+	size = min_t(unsigned int, size, ri->dir_info_len-*ppos);
+	left = copy_to_user(buf, ri->dir_info + *ppos, size);
 	if (left == size)
 		return -EFAULT;
 	*ppos += (size - left);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6639926..611aaef 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -159,36 +159,60 @@  static size_t dio_get_pagev_size(const struct iov_iter *it)
 	return req;
 }
 
+
+static int ceph_init_file_info(struct inode *inode,
+			struct file *file, int fmode)
+{
+	struct ceph_file_info *fi;
+
+	dout("init_file inode %p file %p mode 0%o (%s)\n", inode, file,
+		inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir");
+
+	BUG_ON(inode->i_fop->release != ceph_release);
+
+	fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
+	if (!fi) {
+		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		return -ENOMEM;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		fi->readdir_info = kmem_cache_zalloc(ceph_readdir_cachep,
+							GFP_KERNEL);
+		if (!fi->readdir_info) {
+			kmem_cache_free(ceph_file_cachep, fi);
+			return -ENOMEM;
+		}
+
+		fi->readdir_info->next_offset = 2;
+		fi->readdir_info->readdir_cache_idx = -1;
+	} else {
+		ceph_fscache_register_inode_cookie(inode);
+		ceph_fscache_file_set_cookie(inode, file);
+	}
+
+	fi->fmode = fmode;
+	spin_lock_init(&fi->rw_contexts_lock);
+	INIT_LIST_HEAD(&fi->rw_contexts);
+	file->private_data = fi;
+
+	return 0;
+}
+
 /*
  * initialize private struct file data.
  * if we fail, clean up by dropping fmode reference on the ceph_inode
  */
 static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 {
-	struct ceph_file_info *cf;
 	int ret = 0;
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
-		ceph_fscache_register_inode_cookie(inode);
-		ceph_fscache_file_set_cookie(inode, file);
 	case S_IFDIR:
-		dout("init_file %p %p 0%o (regular)\n", inode, file,
-		     inode->i_mode);
-		cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
-		if (!cf) {
-			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
-			return -ENOMEM;
-		}
-		cf->fmode = fmode;
-
-		spin_lock_init(&cf->rw_contexts_lock);
-		INIT_LIST_HEAD(&cf->rw_contexts);
-
-		cf->next_offset = 2;
-		cf->readdir_cache_idx = -1;
-		file->private_data = cf;
-		BUG_ON(inode->i_fop->release != ceph_release);
+		ret = ceph_init_file_info(inode, file, fmode);
+		if (ret)
+			return ret;
 		break;
 
 	case S_IFLNK:
@@ -460,16 +484,24 @@  int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 int ceph_release(struct inode *inode, struct file *file)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_file_info *cf = file->private_data;
+	struct ceph_file_info *fi = file->private_data;
+	struct ceph_readdir_info *ri = fi->readdir_info;
+
+	dout("release inode %p file %p mode 0%o (%s)\n", inode, file,
+		inode->i_mode, S_ISREG(inode->i_mode) ? "regular" : "dir");
 
-	dout("release inode %p file %p\n", inode, file);
-	ceph_put_fmode(ci, cf->fmode);
-	if (cf->last_readdir)
-		ceph_mdsc_put_request(cf->last_readdir);
-	kfree(cf->last_name);
-	kfree(cf->dir_info);
-	WARN_ON(!list_empty(&cf->rw_contexts));
-	kmem_cache_free(ceph_file_cachep, cf);
+	WARN_ON(!list_empty(&fi->rw_contexts));
+
+	ceph_put_fmode(ci, fi->fmode);
+	if (ri) {
+		if (ri->last_readdir)
+			ceph_mdsc_put_request(ri->last_readdir);
+		kfree(ri->last_name);
+		kfree(ri->dir_info);
+		kmem_cache_free(ceph_readdir_cachep, ri);
+		ri = NULL;
+	}
+	kmem_cache_free(ceph_file_cachep, fi);
 
 	/* wake up anyone waiting for caps on this inode */
 	wake_up_all(&ci->i_cap_wq);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a62d2a9..c1732ff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -677,6 +677,7 @@  static void destroy_fs_client(struct ceph_fs_client *fsc)
 struct kmem_cache *ceph_cap_flush_cachep;
 struct kmem_cache *ceph_dentry_cachep;
 struct kmem_cache *ceph_file_cachep;
+struct kmem_cache *ceph_readdir_cachep;
 
 static void ceph_inode_init_once(void *foo)
 {
@@ -711,14 +712,22 @@  static int __init init_caches(void)
 		goto bad_dentry;
 
 	ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
-
 	if (!ceph_file_cachep)
 		goto bad_file;
 
+	ceph_readdir_cachep = KMEM_CACHE(ceph_readdir_info, SLAB_MEM_SPREAD);
+	if (!ceph_readdir_cachep)
+		goto bad_readdir;
+
 	if ((error = ceph_fscache_register()))
-		goto bad_file;
+		goto bad_fscache;
 
 	return 0;
+
+bad_fscache:
+	kmem_cache_destroy(ceph_readdir_cachep);
+bad_readdir:
+	kmem_cache_destroy(ceph_file_cachep);
 bad_file:
 	kmem_cache_destroy(ceph_dentry_cachep);
 bad_dentry:
@@ -743,6 +752,7 @@  static void destroy_caches(void)
 	kmem_cache_destroy(ceph_cap_flush_cachep);
 	kmem_cache_destroy(ceph_dentry_cachep);
 	kmem_cache_destroy(ceph_file_cachep);
+	kmem_cache_destroy(ceph_readdir_cachep);
 
 	ceph_fscache_unregister();
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 21b2e5b..3a943a0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -665,13 +665,7 @@  extern void ceph_reservation_status(struct ceph_fs_client *client,
 #define CEPH_F_SYNC     1
 #define CEPH_F_ATEND    2
 
-struct ceph_file_info {
-	short fmode;     /* initialized on open */
-	short flags;     /* CEPH_F_* */
-
-	spinlock_t rw_contexts_lock;
-	struct list_head rw_contexts;
-
+struct ceph_readdir_info {
 	/* readdir: position within the dir */
 	u32 frag;
 	struct ceph_mds_request *last_readdir;
@@ -688,6 +682,17 @@  struct ceph_file_info {
 	int dir_info_len;
 };
 
+struct ceph_file_info {
+	short fmode;     /* initialized on open */
+	short flags;     /* CEPH_F_* */
+
+	spinlock_t rw_contexts_lock;
+	struct list_head rw_contexts;
+
+	/* readdir related things */
+	struct ceph_readdir_info *readdir_info;
+};
+
 struct ceph_rw_context {
 	struct list_head list;
 	struct task_struct *thread;
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index c2ec44c..4ca23fa 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -262,6 +262,7 @@  static inline int calc_pages_for(u64 off, u64 len)
 extern struct kmem_cache *ceph_cap_flush_cachep;
 extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
+extern struct kmem_cache *ceph_readdir_cachep;
 
 /* ceph_common.c */
 extern bool libceph_compatible(void *data);