diff mbox series

[v5,1/2] tmpfs: Add per-superblock i_ino support

Message ID 91b4ed6727712cb6d426cf60c740fe2f473f7638.1578225806.git.chris@chrisdown.name (mailing list archive)
State New, archived
Headers show
Series fs: inode: shmem: Reduce risk of inum overflow | expand

Commit Message

Chris Down Jan. 5, 2020, 12:06 p.m. UTC
get_next_ino has a number of problems:

- It uses and returns a uint, which is susceptible to overflow if a
  lot of volatile inodes that use get_next_ino are created.
- It's global, with no specificity per-sb or even per-filesystem. This
  means it's not that difficult to cause inode number wraparounds on a
  single device, which can result in having multiple distinct inodes
  with the same inode number.

This patch adds a per-superblock counter that mitigates the second case.
This design also allows us to later have a specific i_ino size
per-device, for example, allowing users to choose whether to use 32- or
64-bit inodes for each tmpfs mount. This is implemented in the next
commit.

Signed-off-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: kernel-team@fb.com
---
 include/linux/shmem_fs.h |  1 +
 mm/shmem.c               | 30 +++++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

v5: Nothing in code, just resending with correct linux-mm domain.

Comments

Zheng Bin Jan. 6, 2020, 2:03 a.m. UTC | #1
On 2020/1/5 20:06, Chris Down wrote:
> get_next_ino has a number of problems:
>
> - It uses and returns a uint, which is susceptible to become overflowed
>   if a lot of volatile inodes that use get_next_ino are created.
> - It's global, with no specificity per-sb or even per-filesystem. This
>   means it's not that difficult to cause inode number wraparounds on a
>   single device, which can result in having multiple distinct inodes
>   with the same inode number.
>
> This patch adds a per-superblock counter that mitigates the second case.
> This design also allows us to later have a specific i_ino size
> per-device, for example, allowing users to choose whether to use 32- or
> 64-bit inodes for each tmpfs mount. This is implemented in the next
> commit.
>
> Signed-off-by: Chris Down <chris@chrisdown.name>
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Jeff Layton <jlayton@kernel.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: linux-mm@kvack.org
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: kernel-team@fb.com
> ---
>  include/linux/shmem_fs.h |  1 +
>  mm/shmem.c               | 30 +++++++++++++++++++++++++++++-
>  2 files changed, 30 insertions(+), 1 deletion(-)
>
> v5: Nothing in code, just resending with correct linux-mm domain.
>
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index de8e4b71e3ba..7fac91f490dc 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -35,6 +35,7 @@ struct shmem_sb_info {
>  	unsigned char huge;	    /* Whether to try for hugepages */
>  	kuid_t uid;		    /* Mount uid for root directory */
>  	kgid_t gid;		    /* Mount gid for root directory */
> +	ino_t next_ino;		    /* The next per-sb inode number to use */
>  	struct mempolicy *mpol;     /* default memory policy for mappings */
>  	spinlock_t shrinklist_lock;   /* Protects shrinklist */
>  	struct list_head shrinklist;  /* List of shinkable inodes */
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 8793e8cc1a48..9e97ba972225 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2236,6 +2236,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
>  	return 0;
>  }
>  
> +/*
> + * shmem_get_inode - reserve, allocate, and initialise a new inode
> + *
> + * If this tmpfs is from kern_mount we use get_next_ino, which is global, since
> + * inum churn there is low and this avoids taking locks.
> + */
>  static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
>  				     umode_t mode, dev_t dev, unsigned long flags)
>  {
> @@ -2248,7 +2254,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
>  
>  	inode = new_inode(sb);
>  	if (inode) {
> -		inode->i_ino = get_next_ino();
> +		if (sb->s_flags & SB_KERNMOUNT) {
> +			/*
> +			 * __shmem_file_setup, one of our callers, is lock-free:
> +			 * it doesn't hold stat_lock in shmem_reserve_inode
> +			 * since max_inodes is always 0, and is called from
> +			 * potentially unknown contexts. As such, use the global
> +			 * allocator which doesn't require the per-sb stat_lock.
> +			 */
> +			inode->i_ino = get_next_ino();
> +		} else {
> +			spin_lock(&sbinfo->stat_lock);

Using spin_lock will affect performance; how about defining
unsigned long __percpu *last_ino_number; /* Last inode number */
atomic64_t shared_last_ino_number; /* Shared last inode number */
in shmem_sb_info — would that give better performance?

> +			if (unlikely(sbinfo->next_ino > UINT_MAX)) {
> +				/*
> +				 * Emulate get_next_ino uint wraparound for
> +				 * compatibility
> +				 */
> +				sbinfo->next_ino = 1;
> +			}
> +			inode->i_ino = sbinfo->next_ino++;
> +			spin_unlock(&sbinfo->stat_lock);
> +		}
> +
>  		inode_init_owner(inode, dir, mode);
>  		inode->i_blocks = 0;
>  		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
> @@ -3662,6 +3689,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
>  #else
>  	sb->s_flags |= SB_NOUSER;
>  #endif
> +	sbinfo->next_ino = 1;
>  	sbinfo->max_blocks = ctx->blocks;
>  	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
>  	sbinfo->uid = ctx->uid;
Amir Goldstein Jan. 6, 2020, 6:41 a.m. UTC | #2
On Mon, Jan 6, 2020 at 4:04 AM zhengbin (A) <zhengbin13@huawei.com> wrote:
>
>
> On 2020/1/5 20:06, Chris Down wrote:
> > get_next_ino has a number of problems:
> >
> > - It uses and returns a uint, which is susceptible to become overflowed
> >   if a lot of volatile inodes that use get_next_ino are created.
> > - It's global, with no specificity per-sb or even per-filesystem. This
> >   means it's not that difficult to cause inode number wraparounds on a
> >   single device, which can result in having multiple distinct inodes
> >   with the same inode number.
> >
> > This patch adds a per-superblock counter that mitigates the second case.
> > This design also allows us to later have a specific i_ino size
> > per-device, for example, allowing users to choose whether to use 32- or
> > 64-bit inodes for each tmpfs mount. This is implemented in the next
> > commit.
> >
> > Signed-off-by: Chris Down <chris@chrisdown.name>
> > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > Cc: Matthew Wilcox <willy@infradead.org>
> > Cc: Jeff Layton <jlayton@kernel.org>
> > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: linux-mm@kvack.org
> > Cc: linux-fsdevel@vger.kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > Cc: kernel-team@fb.com
> > ---
> >  include/linux/shmem_fs.h |  1 +
> >  mm/shmem.c               | 30 +++++++++++++++++++++++++++++-
> >  2 files changed, 30 insertions(+), 1 deletion(-)
> >
> > v5: Nothing in code, just resending with correct linux-mm domain.
> >
> > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> > index de8e4b71e3ba..7fac91f490dc 100644
> > --- a/include/linux/shmem_fs.h
> > +++ b/include/linux/shmem_fs.h
> > @@ -35,6 +35,7 @@ struct shmem_sb_info {
> >       unsigned char huge;         /* Whether to try for hugepages */
> >       kuid_t uid;                 /* Mount uid for root directory */
> >       kgid_t gid;                 /* Mount gid for root directory */
> > +     ino_t next_ino;             /* The next per-sb inode number to use */
> >       struct mempolicy *mpol;     /* default memory policy for mappings */
> >       spinlock_t shrinklist_lock;   /* Protects shrinklist */
> >       struct list_head shrinklist;  /* List of shinkable inodes */
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 8793e8cc1a48..9e97ba972225 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -2236,6 +2236,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
> >       return 0;
> >  }
> >
> > +/*
> > + * shmem_get_inode - reserve, allocate, and initialise a new inode
> > + *
> > + * If this tmpfs is from kern_mount we use get_next_ino, which is global, since
> > + * inum churn there is low and this avoids taking locks.
> > + */
> >  static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
> >                                    umode_t mode, dev_t dev, unsigned long flags)
> >  {
> > @@ -2248,7 +2254,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
> >
> >       inode = new_inode(sb);
> >       if (inode) {
> > -             inode->i_ino = get_next_ino();
> > +             if (sb->s_flags & SB_KERNMOUNT) {
> > +                     /*
> > +                      * __shmem_file_setup, one of our callers, is lock-free:
> > +                      * it doesn't hold stat_lock in shmem_reserve_inode
> > +                      * since max_inodes is always 0, and is called from
> > +                      * potentially unknown contexts. As such, use the global
> > +                      * allocator which doesn't require the per-sb stat_lock.
> > +                      */
> > +                     inode->i_ino = get_next_ino();
> > +             } else {
> > +                     spin_lock(&sbinfo->stat_lock);
>
> Use spin_lock will affect performance, how about define
>
> unsigned long __percpu *last_ino_number; /* Last inode number */
> atomic64_t shared_last_ino_number; /* Shared last inode number */
> in shmem_sb_info, whose performance will be better?
>

Please take a look at shmem_reserve_inode().
spin lock is already being taken in shmem_get_inode()
so there is nothing to be gained from complicating next_ino counter.

This fact would have been a lot clearer if next_ino was incremented
inside shmem_reserve_inode() and its value returned to be used by
shmem_get_inode(), but I am also fine with code as it is with the
comment above.

Thanks,
Amir.
Chris Down Jan. 6, 2020, 1:17 p.m. UTC | #3
zhengbin (A) writes:
>Use spin_lock will affect performance

"Performance" isn't a binary. In discussions, you should avoid invoking the 
performance boogeyman without presenting any real-world data. :-)

We already have to take this spin lock before when setting the free inode 
count. The two sites can be merged, but it seems unnecessary to conflate their 
purpose at this time.
Hugh Dickins Jan. 7, 2020, 8:01 a.m. UTC | #4
On Mon, 6 Jan 2020, Amir Goldstein wrote:
> On Mon, Jan 6, 2020 at 4:04 AM zhengbin (A) <zhengbin13@huawei.com> wrote:
> > On 2020/1/5 20:06, Chris Down wrote:
> > > get_next_ino has a number of problems:
> > >
> > > - It uses and returns a uint, which is susceptible to become overflowed
> > >   if a lot of volatile inodes that use get_next_ino are created.
> > > - It's global, with no specificity per-sb or even per-filesystem. This
> > >   means it's not that difficult to cause inode number wraparounds on a
> > >   single device, which can result in having multiple distinct inodes
> > >   with the same inode number.
> > >
> > > This patch adds a per-superblock counter that mitigates the second case.
> > > This design also allows us to later have a specific i_ino size
> > > per-device, for example, allowing users to choose whether to use 32- or
> > > 64-bit inodes for each tmpfs mount. This is implemented in the next
> > > commit.
> > >
> > > Signed-off-by: Chris Down <chris@chrisdown.name>
> > > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> > > Cc: Hugh Dickins <hughd@google.com>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > Cc: Matthew Wilcox <willy@infradead.org>
> > > Cc: Jeff Layton <jlayton@kernel.org>
> > > Cc: Johannes Weiner <hannes@cmpxchg.org>
> > > Cc: Tejun Heo <tj@kernel.org>
> > > Cc: linux-mm@kvack.org
> > > Cc: linux-fsdevel@vger.kernel.org
> > > Cc: linux-kernel@vger.kernel.org
> > > Cc: kernel-team@fb.com
> > > ---
> > >  include/linux/shmem_fs.h |  1 +
> > >  mm/shmem.c               | 30 +++++++++++++++++++++++++++++-
> > >  2 files changed, 30 insertions(+), 1 deletion(-)
> > >
> > > v5: Nothing in code, just resending with correct linux-mm domain.
> > >
> > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> > > index de8e4b71e3ba..7fac91f490dc 100644
> > > --- a/include/linux/shmem_fs.h
> > > +++ b/include/linux/shmem_fs.h
> > > @@ -35,6 +35,7 @@ struct shmem_sb_info {
> > >       unsigned char huge;         /* Whether to try for hugepages */
> > >       kuid_t uid;                 /* Mount uid for root directory */
> > >       kgid_t gid;                 /* Mount gid for root directory */
> > > +     ino_t next_ino;             /* The next per-sb inode number to use */
> > >       struct mempolicy *mpol;     /* default memory policy for mappings */
> > >       spinlock_t shrinklist_lock;   /* Protects shrinklist */
> > >       struct list_head shrinklist;  /* List of shinkable inodes */
> > > diff --git a/mm/shmem.c b/mm/shmem.c
> > > index 8793e8cc1a48..9e97ba972225 100644
> > > --- a/mm/shmem.c
> > > +++ b/mm/shmem.c
> > > @@ -2236,6 +2236,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
> > >       return 0;
> > >  }
> > >
> > > +/*
> > > + * shmem_get_inode - reserve, allocate, and initialise a new inode
> > > + *
> > > + * If this tmpfs is from kern_mount we use get_next_ino, which is global, since
> > > + * inum churn there is low and this avoids taking locks.
> > > + */
> > >  static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
> > >                                    umode_t mode, dev_t dev, unsigned long flags)
> > >  {
> > > @@ -2248,7 +2254,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
> > >
> > >       inode = new_inode(sb);
> > >       if (inode) {
> > > -             inode->i_ino = get_next_ino();
> > > +             if (sb->s_flags & SB_KERNMOUNT) {
> > > +                     /*
> > > +                      * __shmem_file_setup, one of our callers, is lock-free:
> > > +                      * it doesn't hold stat_lock in shmem_reserve_inode
> > > +                      * since max_inodes is always 0, and is called from
> > > +                      * potentially unknown contexts. As such, use the global
> > > +                      * allocator which doesn't require the per-sb stat_lock.
> > > +                      */
> > > +                     inode->i_ino = get_next_ino();
> > > +             } else {
> > > +                     spin_lock(&sbinfo->stat_lock);
> >
> > Use spin_lock will affect performance, how about define
> >
> > unsigned long __percpu *last_ino_number; /* Last inode number */
> > atomic64_t shared_last_ino_number; /* Shared last inode number */
> > in shmem_sb_info, whose performance will be better?
> >
> 
> Please take a look at shmem_reserve_inode().
> spin lock is already being taken in shmem_get_inode()
> so there is nothing to be gained from complicating next_ino counter.
> 
> This fact would have been a lot clearer if next_ino was incremented
> inside shmem_reserve_inode() and its value returned to be used by
> shmem_get_inode(), but I am also fine with code as it is with the
> comment above.

Chris, Amir, thank you both for all the work you have been putting
into this over the holiday.  I'm only now catching up, sorry.

It appears that what you are ending up with above is very close to
the patch Google has been using for several years.  I'm glad Chris
has explored some interesting options, disappointed that you had no
more success than I had in trying to solve it efficiently with 32-bit
inos, agree with the way in which Amir cut it back.  That we've come to
the same conclusions independently is good confirmation of this approach.

Only yesterday did I get to see that Amir had asked to see my patch on
the 27th: rediffed against 5.5-rc5, appended now below.  I'm showing it,
though it's known deficient in three ways (not to mention lack of config
or mount options, but I now see Dave Chinner has an interesting take on
those) - better make it visible to you now, than me delay you further.

It does indicate a couple of improvements to Chris's current patch:
reducing use of stat_lock, as Amir suggested (in both nr_inodes limited
and unlimited cases); and need to fix shmem_encode_fh(), which neither
of us did yet.  Where we should go from here, fix Chris's or fix mine,
I've not decided.

From: Hugh Dickins <hughd@google.com>
Date: Fri, 7 Aug 2015 20:08:53 -0700
Subject: [PATCH] tmpfs: provide 64-bit inode numbers

Some programs (including ld.so and clang) try to deduplicate opened
files by comparing st_dev and st_ino, which breaks down when two files
on the same tmpfs have the same inode number: we are now hitting this
problem too often.  J. R. Okajima has reported the same problem with
backup tools.

tmpfs is currently sharing the same 32-bit get_next_ino() pool as
sockets, pipes, ramfs, hugetlbfs etc.  It delivers 32-bit inos even
on a 64-bit kernel for one reason: because if a 32-bit executable
compiled without _FILE_OFFSET_BITS=64 tries to stat a file with an
ino which won't fit in 32 bits, glibc fails that with EOVERFLOW.
glibc is being correct, but unhelpful: so 2.6.22 commit 866b04fccbf1
("inode numbering: make static counters in new_inode and iunique be
32 bits") forced get_next_ino() to unsigned int.

But whatever the upstream need to avoid surprising a mis-built
32-bit executable, Google production can be sure of the 64-bit
environment, and any remaining 32-bit executables built with
_FILE_OFFSET_BITS=64 (our files are sometimes bigger than 2G).

So, leave get_next_ino() as it is, but convert tmpfs to supply
unsigned long inos, and let each superblock keep its own account:
it was weird to be sharing a pool with such disparate uses before.

shmem_reserve_inode() already provides a good place to do this;
and normally it has to take stat_lock to update free_inodes, so
no overhead to increment its next_ino under the same lock.  But
if it's the internal shm_mnt, or mounted with "nr_inodes=0" to
increase scalability by avoiding that stat_lock, then use the
same percpu batching technique as get_next_ino().

Take on board 4.2 commit 2adc376c5519 ("vfs: avoid creation of
inode number 0 in get_next_ino"): for safety, skip any ino whose
low 32 bits is 0.

Upstream?  That's tougher: maybe try to upstream this as is, and
see what falls out; maybe add ino32 or ino64 mount options before
trying; or maybe upstream has to stick with the 32-bit ino, and a
scheme more complicated than this be implemented for tmpfs.

Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com>

1) Probably needs minor corrections for the 32-bit kernel: I wasn't
   worrying about that at the time, and expect some "unsigned long"s
   I didn't need to change for the 64-bit kernel actually need to be
   "u64"s or "ino_t"s now.
2) The "return 1" from shmem_encode_fh() would nowadays be written as
   "return FILEID_INO32_GEN" (and overlayfs takes particular interest
   in that fileid); yet it already put the high 32 bits of the ino into
   the fh: probably needs a different fileid once high 32 bits are set.
3) When this patch was written, a tmpfs could not be remounted from
   limited nr_inodes to unlimited nr_inodes=0; but that restriction
   was accidentally (though sensibly) lifted during v5.4's mount API
   mods; so would need to be taken into account (or reverted) here.
---

 include/linux/shmem_fs.h |    2 +
 mm/shmem.c               |   59 ++++++++++++++++++++++++++++++++++---
 2 files changed, 57 insertions(+), 4 deletions(-)

--- 5.5-rc5/include/linux/shmem_fs.h	2019-11-24 16:32:01.000000000 -0800
+++ linux/include/linux/shmem_fs.h	2020-01-06 19:58:04.487301841 -0800
@@ -30,6 +30,8 @@ struct shmem_sb_info {
 	struct percpu_counter used_blocks;  /* How many are allocated */
 	unsigned long max_inodes;   /* How many inodes are allowed */
 	unsigned long free_inodes;  /* How many are left for allocation */
+	unsigned long next_ino;	    /* Next inode number to be allocated */
+	unsigned long __percpu *ino_batch; /* Next per cpu, if unlimited */
 	spinlock_t stat_lock;	    /* Serialize shmem_sb_info changes */
 	umode_t mode;		    /* Mount mode for root directory */
 	unsigned char huge;	    /* Whether to try for hugepages */
--- 5.5-rc5/mm/shmem.c	2019-12-08 18:04:55.053566640 -0800
+++ linux/mm/shmem.c	2020-01-06 19:58:04.487301841 -0800
@@ -261,9 +261,24 @@ bool vma_is_shmem(struct vm_area_struct
 static LIST_HEAD(shmem_swaplist);
 static DEFINE_MUTEX(shmem_swaplist_mutex);
 
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() reserves "space" for a shmem inode, and allocates
+ * an ino.  Unlike get_next_ino() in fs/inode.c, the 64-bit kernel here goes
+ * for a 64-bit st_ino, expecting even 32-bit userspace to have been built
+ * with _FILE_OFFSET_BITS=64 nowadays; but lest it has not, each sb uses its
+ * own supply, independent of sockets and pipes etc, and of other tmpfs sbs.
+ * Maybe add a 32-bit ino mount option later, if it turns out to be needed.
+ * Don't worry about 64-bit ino support from a 32-bit kernel at this time.
+ *
+ * shmem_reserve_inode() is also called when making a hard link, to allow for
+ * the space needed by each dentry; but no new ino is wanted then (NULL inop).
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, unsigned long *inop)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	unsigned long ino;
+
 	if (sbinfo->max_inodes) {
 		spin_lock(&sbinfo->stat_lock);
 		if (!sbinfo->free_inodes) {
@@ -271,7 +286,35 @@ static int shmem_reserve_inode(struct su
 			return -ENOSPC;
 		}
 		sbinfo->free_inodes--;
+		if (inop) {
+			ino = sbinfo->next_ino++;
+			/* Avoid 0 in the low 32 bits: might appear deleted */
+			if (unlikely((unsigned int)ino == 0))
+				ino = sbinfo->next_ino++;
+			*inop = ino;
+		}
 		spin_unlock(&sbinfo->stat_lock);
+	} else if (inop) {
+		unsigned long *next_ino;
+		/*
+		 * Internal shm_mnt, or mounted with unlimited nr_inodes=0 for
+		 * maximum scalability: we don't need to take stat_lock every
+		 * time here, so use percpu batching like get_next_ino().
+		 */
+		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+		ino = *next_ino;
+		if (unlikely((ino & (SHMEM_INO_BATCH-1)) == 0)) {
+			spin_lock(&sbinfo->stat_lock);
+			ino = sbinfo->next_ino;
+			sbinfo->next_ino += SHMEM_INO_BATCH;
+			spin_unlock(&sbinfo->stat_lock);
+			/* Avoid 0 in the low 32 bits: might appear deleted */
+			if (unlikely((unsigned int)ino == 0))
+				ino++;
+		}
+		*inop = ino;
+		*next_ino = ++ino;
+		put_cpu();
 	}
 	return 0;
 }
@@ -2241,13 +2284,14 @@ static struct inode *shmem_get_inode(str
 	struct inode *inode;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	unsigned long ino;
 
-	if (shmem_reserve_inode(sb))
+	if (shmem_reserve_inode(sb, &ino))
 		return NULL;
 
 	inode = new_inode(sb);
 	if (inode) {
-		inode->i_ino = get_next_ino();
+		inode->i_ino = ino;
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2960,7 +3004,7 @@ static int shmem_link(struct dentry *old
 	 * first link must skip that, to get the accounting right.
 	 */
 	if (inode->i_nlink) {
-		ret = shmem_reserve_inode(inode->i_sb);
+		ret = shmem_reserve_inode(inode->i_sb, NULL);
 		if (ret)
 			goto out;
 	}
@@ -3621,6 +3665,7 @@ static void shmem_put_super(struct super
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 
+	free_percpu(sbinfo->ino_batch);
 	percpu_counter_destroy(&sbinfo->used_blocks);
 	mpol_put(sbinfo->mpol);
 	kfree(sbinfo);
@@ -3663,6 +3708,12 @@ static int shmem_fill_super(struct super
 #endif
 	sbinfo->max_blocks = ctx->blocks;
 	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+	/* If inodes are unlimited for scalability, use percpu ino batching */
+	if (!sbinfo->max_inodes) {
+		sbinfo->ino_batch = alloc_percpu(unsigned long);
+		if (!sbinfo->ino_batch)
+			goto failed;
+	}
 	sbinfo->uid = ctx->uid;
 	sbinfo->gid = ctx->gid;
 	sbinfo->mode = ctx->mode;
Amir Goldstein Jan. 7, 2020, 8:35 a.m. UTC | #5
> Chris, Amir, thank you both for all the work you have been putting
> into this over the holiday.  I'm only now catching up, sorry.
>
> It appears that what you are ending up with above is very close to
> the patch Google has been using for several years.  I'm glad Chris
> has explored some interesting options, disappointed that you had no
> more success than I had in trying to solve it efficiently with 32-bit
> inos, agree with the way in which Amir cut it back.  That we've come to
> the same conclusions independently is good confirmation of this approach.
>

Indeed :)

> Only yesterday did I get to see that Amir had asked to see my patch on
> the 27th: rediffed against 5.5-rc5, appended now below.  I'm showing it,
> though it's known deficient in three ways (not to mention lack of config
> or mount options, but I now see Dave Chinner has an interesting take on
> those) - better make it visible to you now, than me delay you further.
>
> It does indicate a couple of improvements to Chris's current patch:
> reducing use of stat_lock, as Amir suggested (in both nr_inodes limited
> and unlimited cases); and need to fix shmem_encode_fh(), which neither
> of us did yet.  Where we should go from here, fix Chris's or fix mine,
> I've not decided.
>

I vote in favor of the best of both patches, to result in a simpler outcome:
1. use inop approach from Hugh's patch
2. use get_next_ino() instead of per-sb ino_batch for kern_mount

Hugh, do you have any evidence or suspect that shmem_file_setup()
could be contributing to depleting the global get_next_ino pool?
Do you have an objection to the combination above?

> From: Hugh Dickins <hughd@google.com>
> Date: Fri, 7 Aug 2015 20:08:53 -0700
> Subject: [PATCH] tmpfs: provide 64-bit inode numbers
>
> Some programs (including ld.so and clang) try to deduplicate opened
> files by comparing st_dev and st_ino, which breaks down when two files
> on the same tmpfs have the same inode number: we are now hitting this
> problem too often.  J. R. Okajima has reported the same problem with
> backup tools.
>
> tmpfs is currently sharing the same 32-bit get_next_ino() pool as
> sockets, pipes, ramfs, hugetlbfs etc.  It delivers 32-bit inos even
> on a 64-bit kernel for one reason: because if a 32-bit executable
> compiled without _FILE_OFFSET_BITS=64 tries to stat a file with an
> ino which won't fit in 32 bits, glibc fails that with EOVERFLOW.
> glibc is being correct, but unhelpful: so 2.6.22 commit 866b04fccbf1
> ("inode numbering: make static counters in new_inode and iunique be
> 32 bits") forced get_next_ino() to unsigned int.
>
> But whatever the upstream need to avoid surprising a mis-built
> 32-bit executable, Google production can be sure of the 64-bit
> environment, and any remaining 32-bit executables built with
> _FILE_OFFSET_BITS=64 (our files are sometimes bigger than 2G).
>
> So, leave get_next_ino() as it is, but convert tmpfs to supply
> unsigned long inos, and let each superblock keep its own account:
> it was weird to be sharing a pool with such disparate uses before.
>
> shmem_reserve_inode() already provides a good place to do this;
> and normally it has to take stat_lock to update free_inodes, so
> no overhead to increment its next_ino under the same lock.  But
> if it's the internal shm_mnt, or mounted with "nr_inodes=0" to
> increase scalability by avoiding that stat_lock, then use the
> same percpu batching technique as get_next_ino().
>
> Take on board 4.2 commit 2adc376c5519 ("vfs: avoid creation of
> inode number 0 in get_next_ino"): for safety, skip any ino whose
> low 32 bits is 0.
>
> Upstream?  That's tougher: maybe try to upstream this as is, and
> see what falls out; maybe add ino32 or ino64 mount options before
> trying; or maybe upstream has to stick with the 32-bit ino, and a
> scheme more complicated than this be implemented for tmpfs.
>
> Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com>
>
> 1) Probably needs minor corrections for the 32-bit kernel: I wasn't
>    worrying about that at the time, and expect some "unsigned long"s
>    I didn't need to change for the 64-bit kernel actually need to be
>    "u64"s or "ino_t"s now.
> 2) The "return 1" from shmem_encode_fh() would nowadays be written as
>    "return FILEID_INO32_GEN" (and overlayfs takes particular interest
>    in that fileid); yet it already put the high 32 bits of the ino into
>    the fh: probably needs a different fileid once high 32 bits are set.

Nice spotting, but this really isn't a problem for overlayfs.
I agree that it would be nice to follow the same practice as xfs with
XFS_FILEID_TYPE_64FLAG, but as one can see from the private
flag, this is by no means a standard of any sort.

As the name_to_handle_at() man page says:
"Other than the use of the handle_bytes field, the caller should treat the
 file_handle structure as an opaque data type: the handle_type and f_handle
 fields are needed only by a  subsequent call to open_by_handle_at()."

It is true that one of my initial versions was checking the value returned
from encode_fh, but Miklos has pointed out to me that this value is arbitrary
and we shouldn't rely on it.

In current overlayfs, the value FILEID_INO32_GEN is only used internally
to indicate the case where filesystem has no encode_fh op (see comment
above ovl_can_decode_fh()).  IOW, only the special case of encoding
by the default export_encode_fh() is considered a proof for 32bit inodes.
So tmpfs has never been considered as a 32bit inodes filesystem by
overlayfs.

Thanks,
Amir.
Hugh Dickins Jan. 8, 2020, 10:58 a.m. UTC | #6
On Tue, 7 Jan 2020, Amir Goldstein wrote:
> 
> I vote in favor or best of both patches to result in a simpler outcome:
> 1. use inop approach from Hugh's patch
> 2. use get_next_ino() instead of per-sb ino_batch for kern_mount
> 
> Hugh, do you have any evidence or suspect that shmem_file_setup()
> could be contributing to depleting the global get_next_ino pool?

Depends on what the system is used for: on one system, it will make
very little use of that pool, on another it will be one of the major
depleters of the pool.  Generally, it would be kinder to the other
users of the pool (those that might also care about ino duplication)
if shmem were to cede all use of it; but a bigger patch, yes.

> Do you have an objection to the combination above?

Objection would be too strong: I'm uncomfortable with it, and not
tempted to replace our internal implementation by one reverting to
use get_next_ino(); but not as uncomfortable as I am with holding
up progress on the issue.

Uncomfortable because of the "depletion" you mention.  Uncomfortable
because half the reason for ever offering the unlimited "nr_inodes=0"
mount option was to avoid stat_lock overhead (though, for all I know,
maybe nobody anywhere uses that option now - and I'll be surprised if
0-day uses it and reports any slowdown).

Also uncomfortable because your (excellent, and I'd never thought
about it that way) argument that the shm_mnt objects are internal
and unstatable (that may not resemble how you expressed it, I've not
gone back to search the mails to find where you made the point), is
not entirely correct nowadays - memfd_create()d objects come from
that same shm_mnt, and they can be fstat()ed.  What is the
likelihood that anything would care about duplicated inos of
memfd_create()d objects?  Fairly low, I think we'll agree; and
we can probably also say that their inos are an odd implementation
detail that no memfd_create() user should depend upon anyway.  But
I mention it in case it changes your own feeling about the shm_mnt.

> > Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com>
> >
> > 1) Probably needs minor corrections for the 32-bit kernel: I wasn't
> >    worrying about that at the time, and expect some "unsigned long"s
> >    I didn't need to change for the 64-bit kernel actually need to be
> >    "u64"s or "ino_t"s now.
> > 2) The "return 1" from shmem_encode_fh() would nowadays be written as
> >    "return FILEID_INO32_GEN" (and overlayfs takes particular interest
> >    in that fileid); yet it already put the high 32 bits of the ino into
> >    the fh: probably needs a different fileid once high 32 bits are set.
> 
> Nice spotting, but this really isn't a problem for overlayfs.
> I agree that it would be nice to follow the same practice as xfs with
> XFS_FILEID_TYPE_64FLAG, but as one can see from the private
> flag, this is by no means a standard of any sort.
> 
> As the name_to_handle_at() man page says:
> "Other than the use of the handle_bytes field, the caller should treat the
>  file_handle structure as an opaque data type: the handle_type and f_handle
>  fields are needed only by a  subsequent call to open_by_handle_at()."
> 
> It is true that one of my initial versions was checking value returned from
> encode_fh, but Miklos has pointed out to me that this value is arbitrary
> and we shouldn't rely on it.
> 
> In current overlayfs, the value FILEID_INO32_GEN is only used internally
> to indicate the case where filesystem has no encode_fh op (see comment
> above ovl_can_decode_fh()).  IOW, only the special case of encoding
> by the default export_encode_fh() is considered a proof for 32bit inodes.
> So tmpfs has never been considered as a 32bit inodes filesystem by
> overlayfs.

Thanks, you know far more about encode_fh() than I ever shall, so for
now I'll take your word for it that we don't need to make any change
there for this 64-bit ino support.  One day I should look back, go
through the encode_fh() callsites, and decide what that "return 1"
really ought to say.

It's inconvenient, I'm sorry, but I shall have to go quiet again
for a few days - I'm here, but cannot afford to split my attention.

Hugh
Amir Goldstein Jan. 8, 2020, 12:51 p.m. UTC | #7
On Wed, Jan 8, 2020 at 12:59 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Tue, 7 Jan 2020, Amir Goldstein wrote:
> >
> > I vote in favor of the best of both patches to result in a simpler outcome:
> > 1. use inop approach from Hugh's patch
> > 2. use get_next_ino() instead of per-sb ino_batch for kern_mount
> >
> > Hugh, do you have any evidence or suspect that shmem_file_setup()
> > could be contributing to depleting the global get_next_ino pool?
>
> Depends on what the system is used for: on one system, it will make
> very little use of that pool, on another it will be one of the major
> depleters of the pool.  Generally, it would be kinder to the other
> users of the pool (those that might also care about ino duplication)
> if shmem were to cede all use of it; but a bigger patch, yes.
>
> > Do you have an objection to the combination above?
>
> Objection would be too strong: I'm uncomfortable with it, and not
> tempted to replace our internal implementation by one reverting to
> use get_next_ino(); but not as uncomfortable as I am with holding
> up progress on the issue.
>
> Uncomfortable because of the "depletion" you mention.  Uncomfortable
> because half the reason for ever offering the unlimited "nr_inodes=0"
> mount option was to avoid stat_lock overhead (though, for all I know,
> maybe nobody anywhere uses that option now - and I'll be surprised if
> 0-day uses it and reports any slowdown).
>
> Also uncomfortable because your (excellent, and I'd never thought
> about it that way) argument that the shm_mnt objects are internal
> and unstatable (that may not resemble how you expressed it, I've not
> gone back to search the mails to find where you made the point), is
> not entirely correct nowadays - memfd_create()d objects come from
> that same shm_mnt, and they can be fstat()ed.  What is the
> likelihood that anything would care about duplicated inos of
> memfd_create()d objects?  Fairly low, I think we'll agree; and
> we can probably also say that their inos are an odd implementation
> detail that no memfd_create() user should depend upon anyway.  But
> I mention it in case it changes your own feeling about the shm_mnt.
>

I have no strong feeling either. I just didn't know how intensive its use
is. You have provided sufficient arguments IMO to go with your version
of the patch.

> > > Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com>
> > >
> > > 1) Probably needs minor corrections for the 32-bit kernel: I wasn't
> > >    worrying about that at the time, and expect some "unsigned long"s
> > >    I didn't need to change for the 64-bit kernel actually need to be
> > >    "u64"s or "ino_t"s now.
> > > 2) The "return 1" from shmem_encode_fh() would nowadays be written as
> > >    "return FILEID_INO32_GEN" (and overlayfs takes particular interest
> > >    in that fileid); yet it already put the high 32 bits of the ino into
> > >    the fh: probably needs a different fileid once high 32 bits are set.
> >
> > Nice spotting, but this really isn't a problem for overlayfs.
> > I agree that it would be nice to follow the same practice as xfs with
> > XFS_FILEID_TYPE_64FLAG, but as one can see from the private
> > flag, this is by no means a standard of any sort.
> >
> > As the name_to_handle_at() man page says:
> > "Other than the use of the handle_bytes field, the caller should treat the
> >  file_handle structure as an opaque data type: the handle_type and f_handle
> >  fields are needed only by a  subsequent call to open_by_handle_at()."
> >
> > It is true that one of my initial versions was checking value returned from
> > encode_fh, but Miklos has pointed out to me that this value is arbitrary
> > and we shouldn't rely on it.
> >
> > In current overlayfs, the value FILEID_INO32_GEN is only used internally
> > to indicate the case where filesystem has no encode_fh op (see comment
> > above ovl_can_decode_fh()).  IOW, only the special case of encoding
> > by the default export_encode_fh() is considered a proof for 32bit inodes.
> > So tmpfs has never been considered as a 32bit inodes filesystem by
> > overlayfs.
>
> Thanks, you know far more about encode_fh() than I ever shall, so for
> now I'll take your word for it that we don't need to make any change
> there for this 64-bit ino support.  One day I should look back, go
> through the encode_fh() callsites, and decide what that "return 1"
> really ought to say.
>

TBH, I meant to say that return 1 shouldn't matter functionally,
but it would be much nicer to change it to FILEID_INO64_GEN
and define that as 0x81 in include/linux/exportfs.h and actually
order the gen word after the inode64 words.

But that is independent of the per-sb ino change, because the
layout of the file handle has always been [gen,inode64] and never
really matched that standard of FILEID_INO32_GEN.

I may send a patch later on to change that.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index de8e4b71e3ba..7fac91f490dc 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -35,6 +35,7 @@  struct shmem_sb_info {
 	unsigned char huge;	    /* Whether to try for hugepages */
 	kuid_t uid;		    /* Mount uid for root directory */
 	kgid_t gid;		    /* Mount gid for root directory */
+	ino_t next_ino;		    /* The next per-sb inode number to use */
 	struct mempolicy *mpol;     /* default memory policy for mappings */
 	spinlock_t shrinklist_lock;   /* Protects shrinklist */
 	struct list_head shrinklist;  /* List of shinkable inodes */
diff --git a/mm/shmem.c b/mm/shmem.c
index 8793e8cc1a48..9e97ba972225 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2236,6 +2236,12 @@  static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+/*
+ * shmem_get_inode - reserve, allocate, and initialise a new inode
+ *
+ * If this tmpfs is from kern_mount we use get_next_ino, which is global, since
+ * inum churn there is low and this avoids taking locks.
+ */
 static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
 				     umode_t mode, dev_t dev, unsigned long flags)
 {
@@ -2248,7 +2254,28 @@  static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
 	inode = new_inode(sb);
 	if (inode) {
-		inode->i_ino = get_next_ino();
+		if (sb->s_flags & SB_KERNMOUNT) {
+			/*
+			 * __shmem_file_setup, one of our callers, is lock-free:
+			 * it doesn't hold stat_lock in shmem_reserve_inode
+			 * since max_inodes is always 0, and is called from
+			 * potentially unknown contexts. As such, use the global
+			 * allocator which doesn't require the per-sb stat_lock.
+			 */
+			inode->i_ino = get_next_ino();
+		} else {
+			spin_lock(&sbinfo->stat_lock);
+			if (unlikely(sbinfo->next_ino > UINT_MAX)) {
+				/*
+				 * Emulate get_next_ino uint wraparound for
+				 * compatibility
+				 */
+				sbinfo->next_ino = 1;
+			}
+			inode->i_ino = sbinfo->next_ino++;
+			spin_unlock(&sbinfo->stat_lock);
+		}
+
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -3662,6 +3689,7 @@  static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 #else
 	sb->s_flags |= SB_NOUSER;
 #endif
+	sbinfo->next_ino = 1;
 	sbinfo->max_blocks = ctx->blocks;
 	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
 	sbinfo->uid = ctx->uid;