Message ID | 91b4ed6727712cb6d426cf60c740fe2f473f7638.1578225806.git.chris@chrisdown.name (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | fs: inode: shmem: Reduce risk of inum overflow | expand |
On 2020/1/5 20:06, Chris Down wrote: > get_next_ino has a number of problems: > > - It uses and returns a uint, which is susceptible to become overflowed > if a lot of volatile inodes that use get_next_ino are created. > - It's global, with no specificity per-sb or even per-filesystem. This > means it's not that difficult to cause inode number wraparounds on a > single device, which can result in having multiple distinct inodes > with the same inode number. > > This patch adds a per-superblock counter that mitigates the second case. > This design also allows us to later have a specific i_ino size > per-device, for example, allowing users to choose whether to use 32- or > 64-bit inodes for each tmpfs mount. This is implemented in the next > commit. > > Signed-off-by: Chris Down <chris@chrisdown.name> > Reviewed-by: Amir Goldstein <amir73il@gmail.com> > Cc: Hugh Dickins <hughd@google.com> > Cc: Andrew Morton <akpm@linux-foundation.org> > Cc: Al Viro <viro@zeniv.linux.org.uk> > Cc: Matthew Wilcox <willy@infradead.org> > Cc: Jeff Layton <jlayton@kernel.org> > Cc: Johannes Weiner <hannes@cmpxchg.org> > Cc: Tejun Heo <tj@kernel.org> > Cc: linux-mm@kvack.org > Cc: linux-fsdevel@vger.kernel.org > Cc: linux-kernel@vger.kernel.org > Cc: kernel-team@fb.com > --- > include/linux/shmem_fs.h | 1 + > mm/shmem.c | 30 +++++++++++++++++++++++++++++- > 2 files changed, 30 insertions(+), 1 deletion(-) > > v5: Nothing in code, just resending with correct linux-mm domain. 
> > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > index de8e4b71e3ba..7fac91f490dc 100644 > --- a/include/linux/shmem_fs.h > +++ b/include/linux/shmem_fs.h > @@ -35,6 +35,7 @@ struct shmem_sb_info { > unsigned char huge; /* Whether to try for hugepages */ > kuid_t uid; /* Mount uid for root directory */ > kgid_t gid; /* Mount gid for root directory */ > + ino_t next_ino; /* The next per-sb inode number to use */ > struct mempolicy *mpol; /* default memory policy for mappings */ > spinlock_t shrinklist_lock; /* Protects shrinklist */ > struct list_head shrinklist; /* List of shinkable inodes */ > diff --git a/mm/shmem.c b/mm/shmem.c > index 8793e8cc1a48..9e97ba972225 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -2236,6 +2236,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) > return 0; > } > > +/* > + * shmem_get_inode - reserve, allocate, and initialise a new inode > + * > + * If this tmpfs is from kern_mount we use get_next_ino, which is global, since > + * inum churn there is low and this avoids taking locks. > + */ > static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, > umode_t mode, dev_t dev, unsigned long flags) > { > @@ -2248,7 +2254,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode > > inode = new_inode(sb); > if (inode) { > - inode->i_ino = get_next_ino(); > + if (sb->s_flags & SB_KERNMOUNT) { > + /* > + * __shmem_file_setup, one of our callers, is lock-free: > + * it doesn't hold stat_lock in shmem_reserve_inode > + * since max_inodes is always 0, and is called from > + * potentially unknown contexts. As such, use the global > + * allocator which doesn't require the per-sb stat_lock. 
> + */ > + inode->i_ino = get_next_ino(); > + } else { > + spin_lock(&sbinfo->stat_lock); Use spin_lock will affect performance, how about define unsigned long __percpu *last_ino_number; /* Last inode number */ atomic64_t shared_last_ino_number; /* Shared last inode number */ in shmem_sb_info, whose performance will be better? > + if (unlikely(sbinfo->next_ino > UINT_MAX)) { > + /* > + * Emulate get_next_ino uint wraparound for > + * compatibility > + */ > + sbinfo->next_ino = 1; > + } > + inode->i_ino = sbinfo->next_ino++; > + spin_unlock(&sbinfo->stat_lock); > + } > + > inode_init_owner(inode, dir, mode); > inode->i_blocks = 0; > inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); > @@ -3662,6 +3689,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) > #else > sb->s_flags |= SB_NOUSER; > #endif > + sbinfo->next_ino = 1; > sbinfo->max_blocks = ctx->blocks; > sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; > sbinfo->uid = ctx->uid;
On Mon, Jan 6, 2020 at 4:04 AM zhengbin (A) <zhengbin13@huawei.com> wrote: > > > On 2020/1/5 20:06, Chris Down wrote: > > get_next_ino has a number of problems: > > > > - It uses and returns a uint, which is susceptible to become overflowed > > if a lot of volatile inodes that use get_next_ino are created. > > - It's global, with no specificity per-sb or even per-filesystem. This > > means it's not that difficult to cause inode number wraparounds on a > > single device, which can result in having multiple distinct inodes > > with the same inode number. > > > > This patch adds a per-superblock counter that mitigates the second case. > > This design also allows us to later have a specific i_ino size > > per-device, for example, allowing users to choose whether to use 32- or > > 64-bit inodes for each tmpfs mount. This is implemented in the next > > commit. > > > > Signed-off-by: Chris Down <chris@chrisdown.name> > > Reviewed-by: Amir Goldstein <amir73il@gmail.com> > > Cc: Hugh Dickins <hughd@google.com> > > Cc: Andrew Morton <akpm@linux-foundation.org> > > Cc: Al Viro <viro@zeniv.linux.org.uk> > > Cc: Matthew Wilcox <willy@infradead.org> > > Cc: Jeff Layton <jlayton@kernel.org> > > Cc: Johannes Weiner <hannes@cmpxchg.org> > > Cc: Tejun Heo <tj@kernel.org> > > Cc: linux-mm@kvack.org > > Cc: linux-fsdevel@vger.kernel.org > > Cc: linux-kernel@vger.kernel.org > > Cc: kernel-team@fb.com > > --- > > include/linux/shmem_fs.h | 1 + > > mm/shmem.c | 30 +++++++++++++++++++++++++++++- > > 2 files changed, 30 insertions(+), 1 deletion(-) > > > > v5: Nothing in code, just resending with correct linux-mm domain. 
> > > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > > index de8e4b71e3ba..7fac91f490dc 100644 > > --- a/include/linux/shmem_fs.h > > +++ b/include/linux/shmem_fs.h > > @@ -35,6 +35,7 @@ struct shmem_sb_info { > > unsigned char huge; /* Whether to try for hugepages */ > > kuid_t uid; /* Mount uid for root directory */ > > kgid_t gid; /* Mount gid for root directory */ > > + ino_t next_ino; /* The next per-sb inode number to use */ > > struct mempolicy *mpol; /* default memory policy for mappings */ > > spinlock_t shrinklist_lock; /* Protects shrinklist */ > > struct list_head shrinklist; /* List of shinkable inodes */ > > diff --git a/mm/shmem.c b/mm/shmem.c > > index 8793e8cc1a48..9e97ba972225 100644 > > --- a/mm/shmem.c > > +++ b/mm/shmem.c > > @@ -2236,6 +2236,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) > > return 0; > > } > > > > +/* > > + * shmem_get_inode - reserve, allocate, and initialise a new inode > > + * > > + * If this tmpfs is from kern_mount we use get_next_ino, which is global, since > > + * inum churn there is low and this avoids taking locks. > > + */ > > static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, > > umode_t mode, dev_t dev, unsigned long flags) > > { > > @@ -2248,7 +2254,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode > > > > inode = new_inode(sb); > > if (inode) { > > - inode->i_ino = get_next_ino(); > > + if (sb->s_flags & SB_KERNMOUNT) { > > + /* > > + * __shmem_file_setup, one of our callers, is lock-free: > > + * it doesn't hold stat_lock in shmem_reserve_inode > > + * since max_inodes is always 0, and is called from > > + * potentially unknown contexts. As such, use the global > > + * allocator which doesn't require the per-sb stat_lock. 
> > + */ > > + inode->i_ino = get_next_ino(); > > + } else { > > + spin_lock(&sbinfo->stat_lock); > > Use spin_lock will affect performance, how about define > > unsigned long __percpu *last_ino_number; /* Last inode number */ > atomic64_t shared_last_ino_number; /* Shared last inode number */ > in shmem_sb_info, whose performance will be better? > Please take a look at shmem_reserve_inode(). spin lock is already being taken in shmem_get_inode() so there is nothing to be gained from complicating next_ino counter. This fact would have been a lot clearer if next_ino was incremented inside shmem_reserve_inode() and its value returned to be used by shmem_get_inode(), but I am also fine with code as it is with the comment above. Thanks, Amir.
zhengbin (A) writes:
>Use spin_lock will affect performance
"Performance" isn't a binary. In discussions, you should avoid invoking the
performance boogeyman without presenting any real-world data. :-)
We already have to take this spin lock beforehand, when setting the free inode
count. The two sites can be merged, but it seems unnecessary to conflate their
purpose at this time.
On Mon, 6 Jan 2020, Amir Goldstein wrote: > On Mon, Jan 6, 2020 at 4:04 AM zhengbin (A) <zhengbin13@huawei.com> wrote: > > On 2020/1/5 20:06, Chris Down wrote: > > > get_next_ino has a number of problems: > > > > > > - It uses and returns a uint, which is susceptible to become overflowed > > > if a lot of volatile inodes that use get_next_ino are created. > > > - It's global, with no specificity per-sb or even per-filesystem. This > > > means it's not that difficult to cause inode number wraparounds on a > > > single device, which can result in having multiple distinct inodes > > > with the same inode number. > > > > > > This patch adds a per-superblock counter that mitigates the second case. > > > This design also allows us to later have a specific i_ino size > > > per-device, for example, allowing users to choose whether to use 32- or > > > 64-bit inodes for each tmpfs mount. This is implemented in the next > > > commit. > > > > > > Signed-off-by: Chris Down <chris@chrisdown.name> > > > Reviewed-by: Amir Goldstein <amir73il@gmail.com> > > > Cc: Hugh Dickins <hughd@google.com> > > > Cc: Andrew Morton <akpm@linux-foundation.org> > > > Cc: Al Viro <viro@zeniv.linux.org.uk> > > > Cc: Matthew Wilcox <willy@infradead.org> > > > Cc: Jeff Layton <jlayton@kernel.org> > > > Cc: Johannes Weiner <hannes@cmpxchg.org> > > > Cc: Tejun Heo <tj@kernel.org> > > > Cc: linux-mm@kvack.org > > > Cc: linux-fsdevel@vger.kernel.org > > > Cc: linux-kernel@vger.kernel.org > > > Cc: kernel-team@fb.com > > > --- > > > include/linux/shmem_fs.h | 1 + > > > mm/shmem.c | 30 +++++++++++++++++++++++++++++- > > > 2 files changed, 30 insertions(+), 1 deletion(-) > > > > > > v5: Nothing in code, just resending with correct linux-mm domain. 
> > > > > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > > > index de8e4b71e3ba..7fac91f490dc 100644 > > > --- a/include/linux/shmem_fs.h > > > +++ b/include/linux/shmem_fs.h > > > @@ -35,6 +35,7 @@ struct shmem_sb_info { > > > unsigned char huge; /* Whether to try for hugepages */ > > > kuid_t uid; /* Mount uid for root directory */ > > > kgid_t gid; /* Mount gid for root directory */ > > > + ino_t next_ino; /* The next per-sb inode number to use */ > > > struct mempolicy *mpol; /* default memory policy for mappings */ > > > spinlock_t shrinklist_lock; /* Protects shrinklist */ > > > struct list_head shrinklist; /* List of shinkable inodes */ > > > diff --git a/mm/shmem.c b/mm/shmem.c > > > index 8793e8cc1a48..9e97ba972225 100644 > > > --- a/mm/shmem.c > > > +++ b/mm/shmem.c > > > @@ -2236,6 +2236,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) > > > return 0; > > > } > > > > > > +/* > > > + * shmem_get_inode - reserve, allocate, and initialise a new inode > > > + * > > > + * If this tmpfs is from kern_mount we use get_next_ino, which is global, since > > > + * inum churn there is low and this avoids taking locks. > > > + */ > > > static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, > > > umode_t mode, dev_t dev, unsigned long flags) > > > { > > > @@ -2248,7 +2254,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode > > > > > > inode = new_inode(sb); > > > if (inode) { > > > - inode->i_ino = get_next_ino(); > > > + if (sb->s_flags & SB_KERNMOUNT) { > > > + /* > > > + * __shmem_file_setup, one of our callers, is lock-free: > > > + * it doesn't hold stat_lock in shmem_reserve_inode > > > + * since max_inodes is always 0, and is called from > > > + * potentially unknown contexts. As such, use the global > > > + * allocator which doesn't require the per-sb stat_lock. 
> > > + */ > > > + inode->i_ino = get_next_ino(); > > > + } else { > > > + spin_lock(&sbinfo->stat_lock); > > > > Use spin_lock will affect performance, how about define > > > > unsigned long __percpu *last_ino_number; /* Last inode number */ > > atomic64_t shared_last_ino_number; /* Shared last inode number */ > > in shmem_sb_info, whose performance will be better? > > > > Please take a look at shmem_reserve_inode(). > spin lock is already being taken in shmem_get_inode() > so there is nothing to be gained from complicating next_ino counter. > > This fact would have been a lot clearer if next_ino was incremented > inside shmem_reserve_inode() and its value returned to be used by > shmem_get_inode(), but I am also fine with code as it is with the > comment above. Chris, Amir, thank you both for all the work you have been putting into this over the holiday. I'm only now catching up, sorry. It appears that what you are ending up with above is very close to the patch Google has been using for several years. I'm glad Chris has explored some interesting options, disappointed that you had no more success than I had in trying to solve it efficiently with 32-bit inos, agree with the way in which Amir cut it back. That we've come to the same conclusions independently is good confirmation of this approach. Only yesterday did I get to see that Amir had asked to see my patch on the 27th: rediffed against 5.5-rc5, appended now below. I'm showing it, though it's known deficient in three ways (not to mention lack of config or mount options, but I now see Dave Chinner has an interesting take on those) - better make it visible to you now, than me delay you further. It does indicate a couple of improvements to Chris's current patch: reducing use of stat_lock, as Amir suggested (in both nr_inodes limited and unlimited cases); and need to fix shmem_encode_fh(), which neither of us did yet. Where we should go from here, fix Chris's or fix mine, I've not decided. 
From: Hugh Dickins <hughd@google.com> Date: Fri, 7 Aug 2015 20:08:53 -0700 Subject: [PATCH] tmpfs: provide 64-bit inode numbers Some programs (including ld.so and clang) try to deduplicate opened files by comparing st_dev and st_ino, which breaks down when two files on the same tmpfs have the same inode number: we are now hitting this problem too often. J. R. Okajima has reported the same problem with backup tools. tmpfs is currently sharing the same 32-bit get_next_ino() pool as sockets, pipes, ramfs, hugetlbfs etc. It delivers 32-bit inos even on a 64-bit kernel for one reason: because if a 32-bit executable compiled without _FILE_OFFSET_BITS=64 tries to stat a file with an ino which won't fit in 32 bits, glibc fails that with EOVERFLOW. glibc is being correct, but unhelpful: so 2.6.22 commit 866b04fccbf1 ("inode numbering: make static counters in new_inode and iunique be 32 bits") forced get_next_ino() to unsigned int. But whatever the upstream need to avoid surprising a mis-built 32-bit executable, Google production can be sure of the 64-bit environment, and any remaining 32-bit executables built with _FILE_OFFSET_BITS=64 (our files are sometimes bigger than 2G). So, leave get_next_ino() as it is, but convert tmpfs to supply unsigned long inos, and let each superblock keep its own account: it was weird to be sharing a pool with such disparate uses before. shmem_reserve_inode() already provides a good place to do this; and normally it has to take stat_lock to update free_inodes, so no overhead to increment its next_ino under the same lock. But if it's the internal shm_mnt, or mounted with "nr_inodes=0" to increase scalability by avoiding that stat_lock, then use the same percpu batching technique as get_next_ino(). Take on board 4.2 commit 2adc376c5519 ("vfs: avoid creation of inode number 0 in get_next_ino"): for safety, skip any ino whose low 32 bits is 0. Upstream? 
That's tougher: maybe try to upstream this as is, and see what falls out; maybe add ino32 or ino64 mount options before trying; or maybe upstream has to stick with the 32-bit ino, and a scheme more complicated than this be implemented for tmpfs. Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com> 1) Probably needs minor corrections for the 32-bit kernel: I wasn't worrying about that at the time, and expect some "unsigned long"s I didn't need to change for the 64-bit kernel actually need to be "u64"s or "ino_t"s now. 2) The "return 1" from shmem_encode_fh() would nowadays be written as "return FILEID_INO32_GEN" (and overlayfs takes particular interest in that fileid); yet it already put the high 32 bits of the ino into the fh: probably needs a different fileid once high 32 bits are set. 3) When this patch was written, a tmpfs could not be remounted from limited nr_inodes to unlimited nr_inodes=0; but that restriction was accidentally (though sensibly) lifted during v5.4's mount API mods; so would need to be taken into account (or reverted) here. 
--- include/linux/shmem_fs.h | 2 + mm/shmem.c | 59 ++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 4 deletions(-) --- 5.5-rc5/include/linux/shmem_fs.h 2019-11-24 16:32:01.000000000 -0800 +++ linux/include/linux/shmem_fs.h 2020-01-06 19:58:04.487301841 -0800 @@ -30,6 +30,8 @@ struct shmem_sb_info { struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ + unsigned long next_ino; /* Next inode number to be allocated */ + unsigned long __percpu *ino_batch; /* Next per cpu, if unlimited */ spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ --- 5.5-rc5/mm/shmem.c 2019-12-08 18:04:55.053566640 -0800 +++ linux/mm/shmem.c 2020-01-06 19:58:04.487301841 -0800 @@ -261,9 +261,24 @@ bool vma_is_shmem(struct vm_area_struct static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); -static int shmem_reserve_inode(struct super_block *sb) +/* + * shmem_reserve_inode() reserves "space" for a shmem inode, and allocates + * an ino. Unlike get_next_ino() in fs/inode.c, the 64-bit kernel here goes + * for a 64-bit st_ino, expecting even 32-bit userspace to have been built + * with _FILE_OFFSET_BITS=64 nowadays; but lest it has not, each sb uses its + * own supply, independent of sockets and pipes etc, and of other tmpfs sbs. + * Maybe add a 32-bit ino mount option later, if it turns out to be needed. + * Don't worry about 64-bit ino support from a 32-bit kernel at this time. + * + * shmem_reserve_inode() is also called when making a hard link, to allow for + * the space needed by each dentry; but no new ino is wanted then (NULL inop). 
+ */ +#define SHMEM_INO_BATCH 1024 +static int shmem_reserve_inode(struct super_block *sb, unsigned long *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + unsigned long ino; + if (sbinfo->max_inodes) { spin_lock(&sbinfo->stat_lock); if (!sbinfo->free_inodes) { @@ -271,7 +286,35 @@ static int shmem_reserve_inode(struct su return -ENOSPC; } sbinfo->free_inodes--; + if (inop) { + ino = sbinfo->next_ino++; + /* Avoid 0 in the low 32 bits: might appear deleted */ + if (unlikely((unsigned int)ino == 0)) + ino = sbinfo->next_ino++; + *inop = ino; + } spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + unsigned long *next_ino; + /* + * Internal shm_mnt, or mounted with unlimited nr_inodes=0 for + * maximum scalability: we don't need to take stat_lock every + * time here, so use percpu batching like get_next_ino(). + */ + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely((ino & (SHMEM_INO_BATCH-1)) == 0)) { + spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; + spin_unlock(&sbinfo->stat_lock); + /* Avoid 0 in the low 32 bits: might appear deleted */ + if (unlikely((unsigned int)ino == 0)) + ino++; + } + *inop = ino; + *next_ino = ++ino; + put_cpu(); } return 0; } @@ -2241,13 +2284,14 @@ static struct inode *shmem_get_inode(str struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + unsigned long ino; - if (shmem_reserve_inode(sb)) + if (shmem_reserve_inode(sb, &ino)) return NULL; inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); + inode->i_ino = ino; inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -2960,7 +3004,7 @@ static int shmem_link(struct dentry *old * first link must skip that, to get the accounting right. 
*/ if (inode->i_nlink) { - ret = shmem_reserve_inode(inode->i_sb); + ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) goto out; } @@ -3621,6 +3665,7 @@ static void shmem_put_super(struct super { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); @@ -3663,6 +3708,12 @@ static int shmem_fill_super(struct super #endif sbinfo->max_blocks = ctx->blocks; sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + /* If inodes are unlimited for scalability, use percpu ino batching */ + if (!sbinfo->max_inodes) { + sbinfo->ino_batch = alloc_percpu(unsigned long); + if (!sbinfo->ino_batch) + goto failed; + } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->mode = ctx->mode;
> Chris, Amir, thank you both for all the work you have been putting > into this over the holiday. I'm only now catching up, sorry. > > It appears that what you are ending up with above is very close to > the patch Google has been using for several years. I'm glad Chris > has explored some interesting options, disappointed that you had no > more success than I had in trying to solve it efficiently with 32-bit > inos, agree with the way in which Amir cut it back. That we've come to > the same conclusions independently is good confirmation of this approach. > Indeed :) > Only yesterday did I get to see that Amir had asked to see my patch on > the 27th: rediffed against 5.5-rc5, appended now below. I'm showing it, > though it's known deficient in three ways (not to mention lack of config > or mount options, but I now see Dave Chinner has an interesting take on > those) - better make it visible to you now, than me delay you further. > > It does indicate a couple of improvements to Chris's current patch: > reducing use of stat_lock, as Amir suggested (in both nr_inodes limited > and unlimited cases); and need to fix shmem_encode_fh(), which neither > of us did yet. Where we should go from here, fix Chris's or fix mine, > I've not decided. > I vote in favor or best of both patches to result in a simpler outcome: 1. use inop approach from Hugh's patch 2. use get_next_ino() instead of per-sb ino_batch for kern_mount Hugh, do you have any evidence or suspect that shmem_file_setup() could be contributing to depleting the global get_next_ino pool? Do you have an objection to the combination above? > From: Hugh Dickins <hughd@google.com> > Date: Fri, 7 Aug 2015 20:08:53 -0700 > Subject: [PATCH] tmpfs: provide 64-bit inode numbers > > Some programs (including ld.so and clang) try to deduplicate opened > files by comparing st_dev and st_ino, which breaks down when two files > on the same tmpfs have the same inode number: we are now hitting this > problem too often. J. R. 
Okajima has reported the same problem with > backup tools. > > tmpfs is currently sharing the same 32-bit get_next_ino() pool as > sockets, pipes, ramfs, hugetlbfs etc. It delivers 32-bit inos even > on a 64-bit kernel for one reason: because if a 32-bit executable > compiled without _FILE_OFFSET_BITS=64 tries to stat a file with an > ino which won't fit in 32 bits, glibc fails that with EOVERFLOW. > glibc is being correct, but unhelpful: so 2.6.22 commit 866b04fccbf1 > ("inode numbering: make static counters in new_inode and iunique be > 32 bits") forced get_next_ino() to unsigned int. > > But whatever the upstream need to avoid surprising a mis-built > 32-bit executable, Google production can be sure of the 64-bit > environment, and any remaining 32-bit executables built with > _FILE_OFFSET_BITS=64 (our files are sometimes bigger than 2G). > > So, leave get_next_ino() as it is, but convert tmpfs to supply > unsigned long inos, and let each superblock keep its own account: > it was weird to be sharing a pool with such disparate uses before. > > shmem_reserve_inode() already provides a good place to do this; > and normally it has to take stat_lock to update free_inodes, so > no overhead to increment its next_ino under the same lock. But > if it's the internal shm_mnt, or mounted with "nr_inodes=0" to > increase scalability by avoiding that stat_lock, then use the > same percpu batching technique as get_next_ino(). > > Take on board 4.2 commit 2adc376c5519 ("vfs: avoid creation of > inode number 0 in get_next_ino"): for safety, skip any ino whose > low 32 bits is 0. > > Upstream? That's tougher: maybe try to upstream this as is, and > see what falls out; maybe add ino32 or ino64 mount options before > trying; or maybe upstream has to stick with the 32-bit ino, and a > scheme more complicated than this be implemented for tmpfs. 
> > Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com> > > 1) Probably needs minor corrections for the 32-bit kernel: I wasn't > worrying about that at the time, and expect some "unsigned long"s > I didn't need to change for the 64-bit kernel actually need to be > "u64"s or "ino_t"s now. > 2) The "return 1" from shmem_encode_fh() would nowadays be written as > "return FILEID_INO32_GEN" (and overlayfs takes particular interest > in that fileid); yet it already put the high 32 bits of the ino into > the fh: probably needs a different fileid once high 32 bits are set. Nice spotting, but this really isn't a problem for overlayfs. I agree that it would be nice to follow the same practice as xfs with XFS_FILEID_TYPE_64FLAG, but as one can see from the private flag, this is by no means a standard of any sort. As the name_to_handle_at() man page says: "Other than the use of the handle_bytes field, the caller should treat the file_handle structure as an opaque data type: the handle_type and f_handle fields are needed only by a subsequent call to open_by_handle_at()." It is true that one of my initial versions was checking value returned from encode_fh, but Miklos has pointed out to me that this value is arbitrary and we shouldn't rely on it. In current overlayfs, the value FILEID_INO32_GEN is only used internally to indicate the case where filesystem has no encode_fh op (see comment above ovl_can_decode_fh()). IOW, only the special case of encoding by the default export_encode_fh() is considered a proof for 32bit inodes. So tmpfs has never been considered as a 32bit inodes filesystem by overlayfs. Thanks, Amir.
On Tue, 7 Jan 2020, Amir Goldstein wrote: > > I vote in favor or best of both patches to result in a simpler outcome: > 1. use inop approach from Hugh's patch > 2. use get_next_ino() instead of per-sb ino_batch for kern_mount > > Hugh, do you have any evidence or suspect that shmem_file_setup() > could be contributing to depleting the global get_next_ino pool? Depends on what the system is used for: on one system, it will make very little use of that pool, on another it will be one of the major depleters of the pool. Generally, it would be kinder to the other users of the pool (those that might also care about ino duplication) if shmem were to cede all use of it; but a bigger patch, yes. > Do you have an objection to the combination above? Objection would be too strong: I'm uncomfortable with it, and not tempted to replace our internal implementation by one reverting to use get_next_ino(); but not as uncomfortable as I am with holding up progress on the issue. Uncomfortable because of the "depletion" you mention. Uncomfortable because half the reason for ever offering the unlimited "nr_inodes=0" mount option was to avoid stat_lock overhead (though, for all I know, maybe nobody anywhere uses that option now - and I'll be surprised if 0-day uses it and reports any slowdown). Also uncomfortable because your (excellent, and I'd never thought about it that way) argument that the shm_mnt objects are internal and unstatable (that may not resemble how you expressed it, I've not gone back to search the mails to find where you made the point), is not entirely correct nowadays - memfd_create()d objects come from that same shm_mnt, and they can be fstat()ed. What is the likelihood that anything would care about duplicated inos of memfd_create()d objects? Fairly low, I think we'll agree; and we can probably also say that their inos are an odd implementation detail that no memfd_create() user should depend upon anyway. 
But I mention it in case it changes your own feeling about the shm_mnt. > > Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com> > > > > 1) Probably needs minor corrections for the 32-bit kernel: I wasn't > > worrying about that at the time, and expect some "unsigned long"s > > I didn't need to change for the 64-bit kernel actually need to be > > "u64"s or "ino_t"s now. > > 2) The "return 1" from shmem_encode_fh() would nowadays be written as > > "return FILEID_INO32_GEN" (and overlayfs takes particular interest > > in that fileid); yet it already put the high 32 bits of the ino into > > the fh: probably needs a different fileid once high 32 bits are set. > > Nice spotting, but this really isn't a problem for overlayfs. > I agree that it would be nice to follow the same practice as xfs with > XFS_FILEID_TYPE_64FLAG, but as one can see from the private > flag, this is by no means a standard of any sort. > > As the name_to_handle_at() man page says: > "Other than the use of the handle_bytes field, the caller should treat the > file_handle structure as an opaque data type: the handle_type and f_handle > fields are needed only by a subsequent call to open_by_handle_at()." > > It is true that one of my initial versions was checking value returned from > encode_fh, but Miklos has pointed out to me that this value is arbitrary > and we shouldn't rely on it. > > In current overlayfs, the value FILEID_INO32_GEN is only used internally > to indicate the case where filesystem has no encode_fh op (see comment > above ovl_can_decode_fh()). IOW, only the special case of encoding > by the default export_encode_fh() is considered a proof for 32bit inodes. > So tmpfs has never been considered as a 32bit inodes filesystem by > overlayfs. Thanks, you know far more about encode_fh() than I ever shall, so for now I'll take your word for it that we don't need to make any change there for this 64-bit ino support. 
One day I should look back, go through the encode_fh() callsites, and decide what that "return 1" really ought to say. It's inconvenient, I'm sorry, but I shall have to go quiet again for a few days - I'm here, but cannot afford to split my attention. Hugh
On Wed, Jan 8, 2020 at 12:59 PM Hugh Dickins <hughd@google.com> wrote: > > On Tue, 7 Jan 2020, Amir Goldstein wrote: > > > > I vote in favor or best of both patches to result in a simpler outcome: > > 1. use inop approach from Hugh's patch > > 2. use get_next_ino() instead of per-sb ino_batch for kern_mount > > > > Hugh, do you have any evidence or suspect that shmem_file_setup() > > could be contributing to depleting the global get_next_ino pool? > > Depends on what the system is used for: on one system, it will make > very little use of that pool, on another it will be one of the major > depleters of the pool. Generally, it would be kinder to the other > users of the pool (those that might also care about ino duplication) > if shmem were to cede all use of it; but a bigger patch, yes. > > > Do you have an objection to the combination above? > > Objection would be too strong: I'm uncomfortable with it, and not > tempted to replace our internal implementation by one reverting to > use get_next_ino(); but not as uncomfortable as I am with holding > up progress on the issue. > > Uncomfortable because of the "depletion" you mention. Uncomfortable > because half the reason for ever offering the unlimited "nr_inodes=0" > mount option was to avoid stat_lock overhead (though, for all I know, > maybe nobody anywhere uses that option now - and I'll be surprised if > 0-day uses it and reports any slowdown). > > Also uncomfortable because your (excellent, and I'd never thought > about it that way) argument that the shm_mnt objects are internal > and unstatable (that may not resemble how you expressed it, I've not > gone back to search the mails to find where you made the point), is > not entirely correct nowadays - memfd_create()d objects come from > that same shm_mnt, and they can be fstat()ed. What is the > likelihood that anything would care about duplicated inos of > memfd_create()d objects? 
Fairly low, I think we'll agree; and > we can probably also say that their inos are an odd implementation > detail that no memfd_create() user should depend upon anyway. But > I mention it in case it changes your own feeling about the shm_mnt. > I have no strong feeling either. I just didn't know how intensive its use is. You have provided sufficient arguments IMO to go with your version of the patch. > > > Not-Yet-Signed-off-by: Hugh Dickins <hughd@google.com> > > > > > > 1) Probably needs minor corrections for the 32-bit kernel: I wasn't > > > worrying about that at the time, and expect some "unsigned long"s > > > I didn't need to change for the 64-bit kernel actually need to be > > > "u64"s or "ino_t"s now. > > > 2) The "return 1" from shmem_encode_fh() would nowadays be written as > > > "return FILEID_INO32_GEN" (and overlayfs takes particular interest > > > in that fileid); yet it already put the high 32 bits of the ino into > > > the fh: probably needs a different fileid once high 32 bits are set. > > > > Nice spotting, but this really isn't a problem for overlayfs. > > I agree that it would be nice to follow the same practice as xfs with > > XFS_FILEID_TYPE_64FLAG, but as one can see from the private > > flag, this is by no means a standard of any sort. > > > > As the name_to_handle_at() man page says: > > "Other than the use of the handle_bytes field, the caller should treat the > > file_handle structure as an opaque data type: the handle_type and f_handle > > fields are needed only by a subsequent call to open_by_handle_at()." > > > > It is true that one of my initial versions was checking value returned from > > encode_fh, but Miklos has pointed out to me that this value is arbitrary > > and we shouldn't rely on it. > > > > In current overlayfs, the value FILEID_INO32_GEN is only used internally > > to indicate the case where filesystem has no encode_fh op (see comment > > above ovl_can_decode_fh()). 
IOW, only the special > > case of encoding by the default export_encode_fh() is considered a proof for 32bit inodes. > > So tmpfs has never been considered as a 32bit inodes filesystem by > > overlayfs. > > Thanks, you know far more about encode_fh() than I ever shall, so for > now I'll take your word for it that we don't need to make any change > there for this 64-bit ino support. One day I should look back, go > through the encode_fh() callsites, and decide what that "return 1" > really ought to say. > TBH, I meant to say that return 1 shouldn't matter functionally, but it would be much nicer to change it to FILEID_INO64_GEN and define that as 0x81 in include/linux/exportfs.h and actually order the gen word after the inode64 words. But that is independent of the per-sb ino change, because the layout of the file handle has always been [gen,inode64] and never really matched that standard of FILEID_INO32_GEN. I may send a patch later on to change that. Thanks, Amir.
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index de8e4b71e3ba..7fac91f490dc 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -35,6 +35,7 @@ struct shmem_sb_info { unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ + ino_t next_ino; /* The next per-sb inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ diff --git a/mm/shmem.c b/mm/shmem.c index 8793e8cc1a48..9e97ba972225 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2236,6 +2236,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +/* + * shmem_get_inode - reserve, allocate, and initialise a new inode + * + * If this tmpfs is from kern_mount we use get_next_ino, which is global, since + * inum churn there is low and this avoids taking locks. + */ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { @@ -2248,7 +2254,28 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); + if (sb->s_flags & SB_KERNMOUNT) { + /* + * __shmem_file_setup, one of our callers, is lock-free: + * it doesn't hold stat_lock in shmem_reserve_inode + * since max_inodes is always 0, and is called from + * potentially unknown contexts. As such, use the global + * allocator which doesn't require the per-sb stat_lock. 
+ */ + inode->i_ino = get_next_ino(); + } else { + spin_lock(&sbinfo->stat_lock); + if (unlikely(sbinfo->next_ino > UINT_MAX)) { + /* + * Emulate get_next_ino uint wraparound for + * compatibility + */ + sbinfo->next_ino = 1; + } + inode->i_ino = sbinfo->next_ino++; + spin_unlock(&sbinfo->stat_lock); + } + inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -3662,6 +3689,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #else sb->s_flags |= SB_NOUSER; #endif + sbinfo->next_ino = 1; sbinfo->max_blocks = ctx->blocks; sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; sbinfo->uid = ctx->uid;