Message ID | 20110811064059.GU6342@yookeroo.fritz.box (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Aug 10, 2011 at 11:40 PM, David Gibson <david@gibson.dropbear.id.au> wrote: > > This patch, therefore, stores a pointer to the inode instead of the > address_space in the page private data for hugepages. More > importantly it correctly adjusts the reference count on the inodes > when they're added to the page private data. This ensures that the > inode (and therefore the super block) will not be freed before we use > it from free_huge_page. Looks sane, but I *really* want some acks from people who use/know hugetlbfs. Who would that be? I'm adding random people who have acked/signed-off patches to hugetlbfs recently.. Linus --- patch left quoted for new people --- > Signed-off-by: David Gibson <david@gibson.dropbear.id.au> > > Index: working-2.6/fs/hugetlbfs/inode.c > =================================================================== > --- working-2.6.orig/fs/hugetlbfs/inode.c 2011-08-10 16:45:47.864758406 +1000 > +++ working-2.6/fs/hugetlbfs/inode.c 2011-08-10 17:22:21.899638039 +1000 > @@ -874,10 +874,10 @@ out_free: > return -ENOMEM; > } > > -int hugetlb_get_quota(struct address_space *mapping, long delta) > +int hugetlb_get_quota(struct inode *inode, long delta) > { > int ret = 0; > - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); > + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb); > > if (sbinfo->free_blocks > -1) { > spin_lock(&sbinfo->stat_lock); > @@ -891,9 +891,9 @@ int hugetlb_get_quota(struct address_spa > return ret; > } > > -void hugetlb_put_quota(struct address_space *mapping, long delta) > +void hugetlb_put_quota(struct inode *inode, long delta) > { > - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); > + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb); > > if (sbinfo->free_blocks > -1) { > spin_lock(&sbinfo->stat_lock); > Index: working-2.6/include/linux/hugetlb.h > =================================================================== > --- working-2.6.orig/include/linux/hugetlb.h 2011-08-10 16:58:27.952527484 +1000 > +++ working-2.6/include/linux/hugetlb.h 2011-08-10 17:22:08.723572707 +1000 > @@ -171,8 +171,8 @@ extern const struct file_operations huge > extern const struct vm_operations_struct hugetlb_vm_ops; > struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, > struct user_struct **user, int creat_flags); > -int hugetlb_get_quota(struct address_space *mapping, long delta); > -void hugetlb_put_quota(struct address_space *mapping, long delta); > +int hugetlb_get_quota(struct inode *inode, long delta); > +void hugetlb_put_quota(struct inode *inode, long delta); > > static inline int is_file_hugepages(struct file *file) > { > Index: working-2.6/mm/hugetlb.c > =================================================================== > --- working-2.6.orig/mm/hugetlb.c 2011-08-10 16:44:12.212284092 +1000 > +++ working-2.6/mm/hugetlb.c 2011-08-10 17:21:49.603477888 +1000 > @@ -533,10 +533,12 @@ static void free_huge_page(struct page * > */ > struct hstate *h = page_hstate(page); > int nid = page_to_nid(page); > - struct address_space *mapping; > + struct inode *inode; > > - mapping = (struct address_space *) page_private(page); > + inode = (struct inode *) page_private(page); > set_page_private(page, 0); > + iput(inode); > + > page->mapping = NULL; > BUG_ON(page_count(page)); > BUG_ON(page_mapcount(page)); > @@ -551,8 +553,8 @@ static void free_huge_page(struct page * > enqueue_huge_page(h, page); > } > spin_unlock(&hugetlb_lock); > - if (mapping) > - hugetlb_put_quota(mapping, 1); > + if (inode) > + hugetlb_put_quota(inode, 1); > } > > static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) > @@ -1035,7 +1037,7 @@ static struct page *alloc_huge_page(stru > if (chg < 0) > return ERR_PTR(-VM_FAULT_OOM); > if (chg) > - if (hugetlb_get_quota(inode->i_mapping, chg)) > + if (hugetlb_get_quota(inode, chg)) > return ERR_PTR(-VM_FAULT_SIGBUS); > > spin_lock(&hugetlb_lock); > @@ -1045,12 +1047,12 @@ static struct page *alloc_huge_page(stru > if (!page) { > page = alloc_buddy_huge_page(h, NUMA_NO_NODE); > if (!page) { > - hugetlb_put_quota(inode->i_mapping, chg); > + hugetlb_put_quota(inode, chg); > return ERR_PTR(-VM_FAULT_SIGBUS); > } > } > > - set_page_private(page, (unsigned long) mapping); > + set_page_private(page, (unsigned long) igrab(inode)); > > vma_commit_reservation(h, vma, addr); > > @@ -2086,7 +2088,8 @@ static void hugetlb_vm_op_close(struct v > > if (reserve) { > hugetlb_acct_memory(h, -reserve); > - hugetlb_put_quota(vma->vm_file->f_mapping, reserve); > + hugetlb_put_quota(vma->vm_file->f_mapping->host, > + reserve); > } > } > } > @@ -2884,7 +2887,7 @@ int hugetlb_reserve_pages(struct inode * > return chg; > > /* There must be enough filesystem quota for the mapping */ > - if (hugetlb_get_quota(inode->i_mapping, chg)) > + if (hugetlb_get_quota(inode, chg)) > return -ENOSPC; > > /* > @@ -2893,7 +2896,7 @@ int hugetlb_reserve_pages(struct inode * > */ > ret = hugetlb_acct_memory(h, chg); > if (ret < 0) { > - hugetlb_put_quota(inode->i_mapping, chg); > + hugetlb_put_quota(inode, chg); > return ret; > } > > @@ -2922,7 +2925,7 @@ void hugetlb_unreserve_pages(struct inod > inode->i_blocks -= (blocks_per_huge_page(h) * freed); > spin_unlock(&inode->i_lock); > > - hugetlb_put_quota(inode->i_mapping, (chg - freed)); > + hugetlb_put_quota(inode, (chg - freed)); > hugetlb_acct_memory(h, -(chg - freed)); > } > > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Aug 12, 2011 at 9:48 AM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Wed, Aug 10, 2011 at 11:40 PM, David Gibson > <david@gibson.dropbear.id.au> wrote: >> >> This patch, therefore, stores a pointer to the inode instead of the >> address_space in the page private data for hugepages. More >> importantly it correctly adjusts the reference count on the inodes >> when they're added to the page private data. This ensures that the >> inode (and therefore the super block) will not be freed before we use >> it from free_huge_page. > > Looks sane, but I *really* want some acks from people who use/know > hugetlbfs. Who would that be? I'm adding random people who have > acked/signed-off patches to hugetlbfs recently.. At least, code itself looks good to me but your random choice was failed. Maybe people you want are as follows. http://marc.info/?t=126928975800003&r=1&w=2 Ccing right persons.
On Fri, 12 Aug 2011, Minchan Kim wrote: > On Fri, Aug 12, 2011 at 9:48 AM, Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > On Wed, Aug 10, 2011 at 11:40 PM, David Gibson > > <david@gibson.dropbear.id.au> wrote: > >> > >> This patch, therefore, stores a pointer to the inode instead of the > >> address_space in the page private data for hugepages. More > >> importantly it correctly adjusts the reference count on the inodes > >> when they're added to the page private data. This ensures that the > >> inode (and therefore the super block) will not be freed before we use > >> it from free_huge_page. > > > > Looks sane, but I *really* want some acks from people who use/know > > hugetlbfs. Who would that be? I'm adding random people who have > > acked/signed-off patches to hugetlbfs recently.. > > At least, code itself looks good to me but your random choice was failed. > Maybe people you want are as follows. > http://marc.info/?t=126928975800003&r=1&w=2 > > Ccing right persons. I don't know much about hugetlbfs these days, but I think the patch is very wrong. The real change is where alloc_huge_page() does igrab(inode) and free_huge_pages() does iput(inode)? That makes me very nervous, partly because a final iput() is a complex operation, which we wouldn't expect to be doing when "freeing" a page. My first worry was that free_huge_page() could actually get called at interrupt time (when it's in a pagevec of pages to be freed as a batch, then another put_page is done at interrupt time which frees that batch): I worried that we use spin_lock not spin_lock_irqsave on inode->i_lock. To be honest though, I've not followed up whether that's actually a possibility, the compound page path is too twisty for a quick answer; and even if it's a possibility, it's one that's already ignored in the case of hugetlb_lock. Setting that aside, I think this thing of grabbing a reference to inode for each page just does not work as you wish: when we unlink an inode, all its pages should be freed; but because they are themselves holding references to the inode, it and its pages stick around forever. A quick experiment with your patch versus without confirmed that: meminfo HugePages_Free stayed down with your patch, but went back to HugePages_Total without it. Please check, perhaps I'm just mistaken. Sorry, I've not looked into what a constructive alternative might be; and it's not the first time we've had this difficulty - it came up last year when the ->freepage function was added, that the inode may be gone by the time ->freepage(page) is called. On a side note, very good description - thank you, but I wish you'd split the patch into two, the fix and then the inode-instead-of-mapping cleanup. Though personally I'd prefer not to make that "cleanup": it's normal for a struct address space * to be used in struct page (if I delved I guess I'd find good reason why this one is in page->private instead of page->mapping: perhaps because it's needed after page->mapping is reset to NULL, perhaps because it's needed on COWed copies of hugetlbfs pages). Hugh
On Thu, Aug 11, 2011 at 04:40:59PM +1000, David Gibson wrote: > Linus, please apply > > hugetlbfs tracks the current usage of hugepages per hugetlbfs > mountpoint. To correctly track this when hugepages are released, it > must find the right hugetlbfs super_block from the struct page > available in free_huge_page(). a superblock is not a mountpoint, it's a filesystem instance. You can happily have a single filesystem mounted at multiple mount points. > However, this usage is buggy, because nothing ensures that the > address_space is not freed before all the hugepages that belonged to > it are. In practice that will usually be the case, but if extra page > references have been taken by e.g. drivers or kvm doing > get_user_pages() then the file, inode and address space may be > destroyed before all the pages. > > In addition, the quota functions use the mapping only to get the inode > then the super_block. However, most of the callers already have the > inode anyway and have to get the mapping from there. > > This patch, therefore, stores a pointer to the inode instead of the > address_space in the page private data for hugepages. What's sthe point? The lifetime of inode->i_mapping is exactly the same as that of the inode, except for those few filesystem that use one from a different inode (and then for the whole lifetime of the inode), so I can't see how your patch will make a difference. > More > importantly it correctly adjusts the reference count on the inodes > when they're added to the page private data. This ensures that the > inode (and therefore the super block) will not be freed before we use > it from free_huge_page. That seems like the real fix. And even if you'd still do the other bits it should be a separate patch/ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Aug 12, 2011 at 12:15:21PM -0700, Hugh Dickins wrote: > On Fri, 12 Aug 2011, Minchan Kim wrote: > > On Fri, Aug 12, 2011 at 9:48 AM, Linus Torvalds > > <torvalds@linux-foundation.org> wrote: > > > On Wed, Aug 10, 2011 at 11:40 PM, David Gibson > > > <david@gibson.dropbear.id.au> wrote: > > >> > > >> This patch, therefore, stores a pointer to the inode instead of the > > >> address_space in the page private data for hugepages. More > > >> importantly it correctly adjusts the reference count on the inodes > > >> when they're added to the page private data. This ensures that the > > >> inode (and therefore the super block) will not be freed before we use > > >> it from free_huge_page. > > > > > > Looks sane, but I *really* want some acks from people who use/know > > > hugetlbfs. Who would that be? I'm adding random people who have > > > acked/signed-off patches to hugetlbfs recently.. > > > > At least, code itself looks good to me but your random choice was failed. > > Maybe people you want are as follows. > > http://marc.info/?t=126928975800003&r=1&w=2 > > > > Ccing right persons. > > I don't know much about hugetlbfs these days, but I think the patch > is very wrong. > > The real change is where alloc_huge_page() does igrab(inode) and > free_huge_pages() does iput(inode)? > > That makes me very nervous, partly because a final iput() is a complex > operation, which we wouldn't expect to be doing when "freeing" a page. > > My first worry was that free_huge_page() could actually get called at > interrupt time (when it's in a pagevec of pages to be freed as a batch, > then another put_page is done at interrupt time which frees that batch): > I worried that we use spin_lock not spin_lock_irqsave on inode->i_lock. > To be honest though, I've not followed up whether that's actually a > possibility, the compound page path is too twisty for a quick answer; > and even if it's a possibility, it's one that's already ignored in the > case of hugetlb_lock. > > Setting that aside, I think this thing of grabbing a reference to inode > for each page just does not work as you wish: when we unlink an inode, > all its pages should be freed; but because they are themselves holding > references to the inode, it and its pages stick around forever. Ugh, yes. You're absolutely right. That circular reference will mess everything up. Thinking it through and testing fail. > A quick experiment with your patch versus without confirmed that: > meminfo HugePages_Free stayed down with your patch, but went back to > HugePages_Total without it. Please check, perhaps I'm just mistaken. > > Sorry, I've not looked into what a constructive alternative might be; > and it's not the first time we've had this difficulty - it came up last > year when the ->freepage function was added, that the inode may be gone > by the time ->freepage(page) is called. Ok, so. In fact the quota functions we call at free time only need the super block, not the inode per se. If we put a superblock pointer instead of an inode pointer in page private, and refcounted that, I think that should remove the circular ref. The only reason I didn't do it before is that the superblock refcounting functions didn't seem to be globally visible in an obvious way. Does that sound like a reasonable approach? > On a side note, very good description - thank you, but I wish you'd > split the patch into two, the fix and then the inode-instead-of-mapping > cleanup. Though personally I'd prefer not to make that "cleanup": it's > normal for a struct address space * to be used in struct page (if I delved > I guess I'd find good reason why this one is in page->private instead of > page->mapping: perhaps because it's needed after page->mapping is reset > to NULL, perhaps because it's needed on COWed copies of hugetlbfs pages). That is an interesting question. But it doesn't address the basic point. mappings aren't refcounted themselves, and as far as I can tell their lifetime is bound to that of their inode.
On Sat, 13 Aug 2011, David Gibson wrote: > On Fri, Aug 12, 2011 at 12:15:21PM -0700, Hugh Dickins wrote: > > > > Setting that aside, I think this thing of grabbing a reference to inode > > for each page just does not work as you wish: when we unlink an inode, > > all its pages should be freed; but because they are themselves holding > > references to the inode, it and its pages stick around forever. > > Ugh, yes. You're absolutely right. That circular reference will mess > everything up. Thinking it through and testing fail. > > > A quick experiment with your patch versus without confirmed that: > > meminfo HugePages_Free stayed down with your patch, but went back to > > HugePages_Total without it. Please check, perhaps I'm just mistaken. > > > > Sorry, I've not looked into what a constructive alternative might be; > > and it's not the first time we've had this difficulty - it came up last > > year when the ->freepage function was added, that the inode may be gone > > by the time ->freepage(page) is called. > > Ok, so. In fact the quota functions we call at free time only need > the super block, not the inode per se. If we put a superblock pointer > instead of an inode pointer in page private, and refcounted that, I > think that should remove the circular ref. The only reason I didn't > do it before is that the superblock refcounting functions didn't seem > to be globally visible in an obvious way. > > Does that sound like a reasonable approach? That does sound closer to a reaonable approach, but my guess is that it will suck you into a world of superblock mutexes and semaphores, which you cannot take at free_huge_page() time. It might be necessary to hang your own tiny structure off the superblock, with one refcount for the superblock, and one for each hugepage attached, you freeing that structure when the count goes down to zero from either direction. Whatever you do needs testing with lockdep and atomic sleep checks. I do dislike tying these separate levels together in such an unusual way, but it is a difficult problem and I don't know of an easy answer. Maybe we'll need to find a better answer for other reasons, it does come up from time to time e.g. recent race between evicting inode and nrpages going down to 0. You might care to take a look at how tmpfs (mm/shmem.c) deals with the equivalent issue there (sbinfo->used_blocks). But I expect you to conclude that hugetlbfs cannot afford the kind of approximations that tmpfs can afford. Although I think tmpfs is more correct, to be associating the count with pagecache (so the count goes down as soon as a page is truncated or evicted from pagecache), your fewer and huger pages, and reservation conventions, may well demand that the count stays up until the page is actually freed back to hugepool. And let's not pretend that what tmpfs does is wonderful: the strange shmem_recalc_inode() tries its best to notice when memory pressure has freed clean pages, but it never looks beyond the inode being accessed at the times it's called. Not at all satisfactory, but not actually an issue in practice, since we stopped allocating pages for simple reads from sparse file. I did want to convert tmpfs to use ->freepage(), but couldn't manage it without stable mapping - same problem as you have. Hugh -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Index: working-2.6/fs/hugetlbfs/inode.c =================================================================== --- working-2.6.orig/fs/hugetlbfs/inode.c 2011-08-10 16:45:47.864758406 +1000 +++ working-2.6/fs/hugetlbfs/inode.c 2011-08-10 17:22:21.899638039 +1000 @@ -874,10 +874,10 @@ out_free: return -ENOMEM; } -int hugetlb_get_quota(struct address_space *mapping, long delta) +int hugetlb_get_quota(struct inode *inode, long delta) { int ret = 0; - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb); if (sbinfo->free_blocks > -1) { spin_lock(&sbinfo->stat_lock); @@ -891,9 +891,9 @@ int hugetlb_get_quota(struct address_spa return ret; } -void hugetlb_put_quota(struct address_space *mapping, long delta) +void hugetlb_put_quota(struct inode *inode, long delta) { - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb); if (sbinfo->free_blocks > -1) { spin_lock(&sbinfo->stat_lock); Index: working-2.6/include/linux/hugetlb.h =================================================================== --- working-2.6.orig/include/linux/hugetlb.h 2011-08-10 16:58:27.952527484 +1000 +++ working-2.6/include/linux/hugetlb.h 2011-08-10 17:22:08.723572707 +1000 @@ -171,8 +171,8 @@ extern const struct file_operations huge extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, struct user_struct **user, int creat_flags); -int hugetlb_get_quota(struct address_space *mapping, long delta); -void hugetlb_put_quota(struct address_space *mapping, long delta); +int hugetlb_get_quota(struct inode *inode, long delta); +void hugetlb_put_quota(struct inode *inode, long delta); static inline int is_file_hugepages(struct file *file) { Index: working-2.6/mm/hugetlb.c =================================================================== --- working-2.6.orig/mm/hugetlb.c 2011-08-10 16:44:12.212284092 +1000 +++ working-2.6/mm/hugetlb.c 2011-08-10 17:21:49.603477888 +1000 @@ -533,10 +533,12 @@ static void free_huge_page(struct page * */ struct hstate *h = page_hstate(page); int nid = page_to_nid(page); - struct address_space *mapping; + struct inode *inode; - mapping = (struct address_space *) page_private(page); + inode = (struct inode *) page_private(page); set_page_private(page, 0); + iput(inode); + page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); @@ -551,8 +553,8 @@ static void free_huge_page(struct page * enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); - if (mapping) - hugetlb_put_quota(mapping, 1); + if (inode) + hugetlb_put_quota(inode, 1); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -1035,7 +1037,7 @@ static struct page *alloc_huge_page(stru if (chg < 0) return ERR_PTR(-VM_FAULT_OOM); if (chg) - if (hugetlb_get_quota(inode->i_mapping, chg)) + if (hugetlb_get_quota(inode, chg)) return ERR_PTR(-VM_FAULT_SIGBUS); spin_lock(&hugetlb_lock); @@ -1045,12 +1047,12 @@ static struct page *alloc_huge_page(stru if (!page) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { - hugetlb_put_quota(inode->i_mapping, chg); + hugetlb_put_quota(inode, chg); return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_private(page, (unsigned long) mapping); + set_page_private(page, (unsigned long) igrab(inode)); vma_commit_reservation(h, vma, addr); @@ -2086,7 +2088,8 @@ static void hugetlb_vm_op_close(struct v if (reserve) { hugetlb_acct_memory(h, -reserve); - hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + hugetlb_put_quota(vma->vm_file->f_mapping->host, + reserve); } } } @@ -2884,7 +2887,7 @@ int hugetlb_reserve_pages(struct inode * return chg; /* There must be enough filesystem quota for the mapping */ - if (hugetlb_get_quota(inode->i_mapping, chg)) + if (hugetlb_get_quota(inode, chg)) return -ENOSPC; /* @@ -2893,7 +2896,7 @@ int hugetlb_reserve_pages(struct inode * */ ret = hugetlb_acct_memory(h, chg); if (ret < 0) { - hugetlb_put_quota(inode->i_mapping, chg); + hugetlb_put_quota(inode, chg); return ret; } @@ -2922,7 +2925,7 @@ void hugetlb_unreserve_pages(struct inod inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); - hugetlb_put_quota(inode->i_mapping, (chg - freed)); + hugetlb_put_quota(inode, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); }
Linus, please apply hugetlbfs tracks the current usage of hugepages per hugetlbfs mountpoint. To correctly track this when hugepages are released, it must find the right hugetlbfs super_block from the struct page available in free_huge_page(). It does this by storing a pointer to the hugepage's struct address_space in the page_private data. The hugetlb_{get,put}_quota functions go from this mapping to the inode and thence to the super_block. However, this usage is buggy, because nothing ensures that the address_space is not freed before all the hugepages that belonged to it are. In practice that will usually be the case, but if extra page references have been taken by e.g. drivers or kvm doing get_user_pages() then the file, inode and address space may be destroyed before all the pages. In addition, the quota functions use the mapping only to get the inode then the super_block. However, most of the callers already have the inode anyway and have to get the mapping from there. This patch, therefore, stores a pointer to the inode instead of the address_space in the page private data for hugepages. More importantly it correctly adjusts the reference count on the inodes when they're added to the page private data. This ensures that the inode (and therefore the super block) will not be freed before we use it from free_huge_page. Signed-off-by: David Gibson <david@gibson.dropbear.id.au>