Message ID | 20210117151053.24600-6-songmuchun@bytedance.com (mailing list archive) |
---|---|
State | New, archived |
Series | Free some vmemmap pages of HugeTLB page |
On Sun, 17 Jan 2021, Muchun Song wrote: > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > index ce4be1fa93c2..3b146d5949f3 100644 > --- a/mm/sparse-vmemmap.c > +++ b/mm/sparse-vmemmap.c > @@ -29,6 +29,7 @@ > #include <linux/sched.h> > #include <linux/pgtable.h> > #include <linux/bootmem_info.h> > +#include <linux/delay.h> > > #include <asm/dma.h> > #include <asm/pgalloc.h> > @@ -40,7 +41,8 @@ > * @remap_pte: called for each non-empty PTE (lowest-level) entry. > * @reuse_page: the page which is reused for the tail vmemmap pages. > * @reuse_addr: the virtual address of the @reuse_page page. > - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. > + * @vmemmap_pages: the list head of the vmemmap pages that can be freed > + * or is mapped from. > */ > struct vmemmap_remap_walk { > void (*remap_pte)(pte_t *pte, unsigned long addr, > @@ -50,6 +52,10 @@ struct vmemmap_remap_walk { > struct list_head *vmemmap_pages; > }; > > +/* The gfp mask of allocating vmemmap page */ > +#define GFP_VMEMMAP_PAGE \ > + (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE) > + This is unnecessary, just use the gfp mask directly in allocator. > static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, > unsigned long end, > struct vmemmap_remap_walk *walk) > @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, > free_vmemmap_page_list(&vmemmap_pages); > } > > +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, > + struct vmemmap_remap_walk *walk) > +{ > + pgprot_t pgprot = PAGE_KERNEL; > + struct page *page; > + void *to; > + > + BUG_ON(pte_page(*pte) != walk->reuse_page); > + > + page = list_first_entry(walk->vmemmap_pages, struct page, lru); > + list_del(&page->lru); > + to = page_to_virt(page); > + copy_page(to, (void *)walk->reuse_addr); > + > + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); > +} > + > +static void alloc_vmemmap_page_list(struct list_head *list, > + unsigned long start, unsigned long end) > +{ > + unsigned long addr; > + > + for (addr = start; addr < end; addr += PAGE_SIZE) { > + struct page *page; > + int nid = page_to_nid((const void *)addr); > + > +retry: > + page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0); > + if (unlikely(!page)) { > + msleep(100); > + /* > + * We should retry infinitely, because we cannot > + * handle allocation failures. Once we allocate > + * vmemmap pages successfully, then we can free > + * a HugeTLB page. > + */ > + goto retry; Ugh, I don't think this will work, there's no guarantee that we'll ever succeed and now we can't free a 2MB hugepage because we cannot allocate a 4KB page. We absolutely have to ensure we make forward progress here. We're going to be freeing the hugetlb page after this succeeeds, can we not use part of the hugetlb page that we're freeing for this memory instead? > + } > + list_add_tail(&page->lru, list); > + } > +} > + > +/** > + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) > + * to the page which is from the @vmemmap_pages > + * respectively. > + * @start: start address of the vmemmap virtual address range. > + * @end: end address of the vmemmap virtual address range. > + * @reuse: reuse address. 
> + */ > +void vmemmap_remap_alloc(unsigned long start, unsigned long end, > + unsigned long reuse) > +{ > + LIST_HEAD(vmemmap_pages); > + struct vmemmap_remap_walk walk = { > + .remap_pte = vmemmap_restore_pte, > + .reuse_addr = reuse, > + .vmemmap_pages = &vmemmap_pages, > + }; > + > + might_sleep(); > + > + /* See the comment in the vmemmap_remap_free(). */ > + BUG_ON(start - reuse != PAGE_SIZE); > + > + alloc_vmemmap_page_list(&vmemmap_pages, start, end); > + vmemmap_remap_range(reuse, end, &walk); > +} > + > /* > * Allocate a block of memory to be used to back the virtual memory map > * or to back the page tables that are used to create the mapping. > -- > 2.11.0 > >
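For reference, a minimal sketch of what a fallible alloc_vmemmap_page_list() could look like if the infinite retry were dropped; the int return value and the cleanup path are assumptions layered on top of the posted patch, not part of it:

```c
/*
 * Sketch only: give up and report failure instead of retrying forever.
 * The gfp mask mirrors the one in the posted patch; the error return
 * and the unwind of partially allocated pages are assumptions.
 */
static int alloc_vmemmap_page_list(struct list_head *list,
				   unsigned long start, unsigned long end)
{
	struct page *page, *next;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		int nid = page_to_nid((struct page *)addr);

		page = alloc_pages_node(nid, GFP_KERNEL | __GFP_RETRY_MAYFAIL |
					     __GFP_NOWARN | __GFP_THISNODE, 0);
		if (!page)
			goto out_free;
		list_add_tail(&page->lru, list);
	}
	return 0;

out_free:
	/* Undo the partial allocation so the caller can simply fail. */
	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		__free_page(page);
	}
	return -ENOMEM;
}
```

The caller would then have to put the huge page back into the pool instead of freeing it to buddy.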
On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote: > > > On Sun, 17 Jan 2021, Muchun Song wrote: > > > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > > index ce4be1fa93c2..3b146d5949f3 100644 > > --- a/mm/sparse-vmemmap.c > > +++ b/mm/sparse-vmemmap.c > > @@ -29,6 +29,7 @@ > > #include <linux/sched.h> > > #include <linux/pgtable.h> > > #include <linux/bootmem_info.h> > > +#include <linux/delay.h> > > > > #include <asm/dma.h> > > #include <asm/pgalloc.h> > > @@ -40,7 +41,8 @@ > > * @remap_pte: called for each non-empty PTE (lowest-level) entry. > > * @reuse_page: the page which is reused for the tail vmemmap pages. > > * @reuse_addr: the virtual address of the @reuse_page page. > > - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. > > + * @vmemmap_pages: the list head of the vmemmap pages that can be freed > > + * or is mapped from. > > */ > > struct vmemmap_remap_walk { > > void (*remap_pte)(pte_t *pte, unsigned long addr, > > @@ -50,6 +52,10 @@ struct vmemmap_remap_walk { > > struct list_head *vmemmap_pages; > > }; > > > > +/* The gfp mask of allocating vmemmap page */ > > +#define GFP_VMEMMAP_PAGE \ > > + (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE) > > + > > This is unnecessary, just use the gfp mask directly in allocator. Will do. Thanks. > > > static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, > > unsigned long end, > > struct vmemmap_remap_walk *walk) > > @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, > > free_vmemmap_page_list(&vmemmap_pages); > > } > > > > +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, > > + struct vmemmap_remap_walk *walk) > > +{ > > + pgprot_t pgprot = PAGE_KERNEL; > > + struct page *page; > > + void *to; > > + > > + BUG_ON(pte_page(*pte) != walk->reuse_page); > > + > > + page = list_first_entry(walk->vmemmap_pages, struct page, lru); > > + list_del(&page->lru); > > + to = page_to_virt(page); > > + copy_page(to, (void *)walk->reuse_addr); > > + > > + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); > > +} > > + > > +static void alloc_vmemmap_page_list(struct list_head *list, > > + unsigned long start, unsigned long end) > > +{ > > + unsigned long addr; > > + > > + for (addr = start; addr < end; addr += PAGE_SIZE) { > > + struct page *page; > > + int nid = page_to_nid((const void *)addr); > > + > > +retry: > > + page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0); > > + if (unlikely(!page)) { > > + msleep(100); > > + /* > > + * We should retry infinitely, because we cannot > > + * handle allocation failures. Once we allocate > > + * vmemmap pages successfully, then we can free > > + * a HugeTLB page. > > + */ > > + goto retry; > > Ugh, I don't think this will work, there's no guarantee that we'll ever > succeed and now we can't free a 2MB hugepage because we cannot allocate a > 4KB page. We absolutely have to ensure we make forward progress here. This can trigger a OOM when there is no memory and kill someone to release some memory. Right? > > We're going to be freeing the hugetlb page after this succeeeds, can we > not use part of the hugetlb page that we're freeing for this memory > instead? It seems a good idea. We can try to allocate memory firstly, if successful, just use the new page to remap (it can reduce memory fragmentation). If not, we can use part of the hugetlb page to remap. What's your opinion about this? 
> > > + } > > + list_add_tail(&page->lru, list); > > + } > > +} > > + > > +/** > > + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) > > + * to the page which is from the @vmemmap_pages > > + * respectively. > > + * @start: start address of the vmemmap virtual address range. > > + * @end: end address of the vmemmap virtual address range. > > + * @reuse: reuse address. > > + */ > > +void vmemmap_remap_alloc(unsigned long start, unsigned long end, > > + unsigned long reuse) > > +{ > > + LIST_HEAD(vmemmap_pages); > > + struct vmemmap_remap_walk walk = { > > + .remap_pte = vmemmap_restore_pte, > > + .reuse_addr = reuse, > > + .vmemmap_pages = &vmemmap_pages, > > + }; > > + > > + might_sleep(); > > + > > + /* See the comment in the vmemmap_remap_free(). */ > > + BUG_ON(start - reuse != PAGE_SIZE); > > + > > + alloc_vmemmap_page_list(&vmemmap_pages, start, end); > > + vmemmap_remap_range(reuse, end, &walk); > > +} > > + > > /* > > * Allocate a block of memory to be used to back the virtual memory map > > * or to back the page tables that are used to create the mapping. > > -- > > 2.11.0 > > > >
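A rough sketch of the allocate-first, fall-back-to-the-huge-page idea discussed above; the helper name, its signature, and the *used bookkeeping are assumptions, and neither the remapping itself nor the accounting needed to later free the remaining subpages as order-0 pages is shown:

```c
/*
 * Sketch only: prefer a freshly allocated page, otherwise hand out the
 * next unused subpage from the tail of the huge page being freed.
 * Such a subpage can never become part of a huge page again.
 */
static struct page *get_vmemmap_page(struct hstate *h, struct page *head,
				     int nid, unsigned int *used)
{
	struct page *page;

	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_NOWARN, 0);
	if (page)
		return page;	/* preferred: keeps the huge page intact */

	return head + pages_per_huge_page(h) - ++(*used);
}
```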
On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote: > > On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote: > > > > > > On Sun, 17 Jan 2021, Muchun Song wrote: > > > > > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > > > index ce4be1fa93c2..3b146d5949f3 100644 > > > --- a/mm/sparse-vmemmap.c > > > +++ b/mm/sparse-vmemmap.c > > > @@ -29,6 +29,7 @@ > > > #include <linux/sched.h> > > > #include <linux/pgtable.h> > > > #include <linux/bootmem_info.h> > > > +#include <linux/delay.h> > > > > > > #include <asm/dma.h> > > > #include <asm/pgalloc.h> > > > @@ -40,7 +41,8 @@ > > > * @remap_pte: called for each non-empty PTE (lowest-level) entry. > > > * @reuse_page: the page which is reused for the tail vmemmap pages. > > > * @reuse_addr: the virtual address of the @reuse_page page. > > > - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. > > > + * @vmemmap_pages: the list head of the vmemmap pages that can be freed > > > + * or is mapped from. > > > */ > > > struct vmemmap_remap_walk { > > > void (*remap_pte)(pte_t *pte, unsigned long addr, > > > @@ -50,6 +52,10 @@ struct vmemmap_remap_walk { > > > struct list_head *vmemmap_pages; > > > }; > > > > > > +/* The gfp mask of allocating vmemmap page */ > > > +#define GFP_VMEMMAP_PAGE \ > > > + (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE) > > > + > > > > This is unnecessary, just use the gfp mask directly in allocator. > > Will do. Thanks. > > > > > > static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, > > > unsigned long end, > > > struct vmemmap_remap_walk *walk) > > > @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, > > > free_vmemmap_page_list(&vmemmap_pages); > > > } > > > > > > +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, > > > + struct vmemmap_remap_walk *walk) > > > +{ > > > + pgprot_t pgprot = PAGE_KERNEL; > > > + struct page *page; > > > + void *to; > > > + > > > + BUG_ON(pte_page(*pte) != walk->reuse_page); > > > + > > > + page = list_first_entry(walk->vmemmap_pages, struct page, lru); > > > + list_del(&page->lru); > > > + to = page_to_virt(page); > > > + copy_page(to, (void *)walk->reuse_addr); > > > + > > > + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); > > > +} > > > + > > > +static void alloc_vmemmap_page_list(struct list_head *list, > > > + unsigned long start, unsigned long end) > > > +{ > > > + unsigned long addr; > > > + > > > + for (addr = start; addr < end; addr += PAGE_SIZE) { > > > + struct page *page; > > > + int nid = page_to_nid((const void *)addr); > > > + > > > +retry: > > > + page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0); > > > + if (unlikely(!page)) { > > > + msleep(100); > > > + /* > > > + * We should retry infinitely, because we cannot > > > + * handle allocation failures. Once we allocate > > > + * vmemmap pages successfully, then we can free > > > + * a HugeTLB page. > > > + */ > > > + goto retry; > > > > Ugh, I don't think this will work, there's no guarantee that we'll ever > > succeed and now we can't free a 2MB hugepage because we cannot allocate a > > 4KB page. We absolutely have to ensure we make forward progress here. > > This can trigger a OOM when there is no memory and kill someone to release > some memory. Right? > > > > > We're going to be freeing the hugetlb page after this succeeeds, can we > > not use part of the hugetlb page that we're freeing for this memory > > instead? > > It seems a good idea. 
We can try to allocate memory firstly, if successful, > just use the new page to remap (it can reduce memory fragmentation). > If not, we can use part of the hugetlb page to remap. What's your opinion > about this? If the HugeTLB page is a gigantic page which is allocated from CMA. In this case, we cannot use part of the hugetlb page to remap. Right? > > > > > > + } > > > + list_add_tail(&page->lru, list); > > > + } > > > +} > > > + > > > +/** > > > + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) > > > + * to the page which is from the @vmemmap_pages > > > + * respectively. > > > + * @start: start address of the vmemmap virtual address range. > > > + * @end: end address of the vmemmap virtual address range. > > > + * @reuse: reuse address. > > > + */ > > > +void vmemmap_remap_alloc(unsigned long start, unsigned long end, > > > + unsigned long reuse) > > > +{ > > > + LIST_HEAD(vmemmap_pages); > > > + struct vmemmap_remap_walk walk = { > > > + .remap_pte = vmemmap_restore_pte, > > > + .reuse_addr = reuse, > > > + .vmemmap_pages = &vmemmap_pages, > > > + }; > > > + > > > + might_sleep(); > > > + > > > + /* See the comment in the vmemmap_remap_free(). */ > > > + BUG_ON(start - reuse != PAGE_SIZE); > > > + > > > + alloc_vmemmap_page_list(&vmemmap_pages, start, end); > > > + vmemmap_remap_range(reuse, end, &walk); > > > +} > > > + > > > /* > > > * Allocate a block of memory to be used to back the virtual memory map > > > * or to back the page tables that are used to create the mapping. > > > -- > > > 2.11.0 > > > > > >
On 25.01.21 08:41, Muchun Song wrote: > On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote: >> >> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote: >>> >>> >>> On Sun, 17 Jan 2021, Muchun Song wrote: >>> >>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c >>>> index ce4be1fa93c2..3b146d5949f3 100644 >>>> --- a/mm/sparse-vmemmap.c >>>> +++ b/mm/sparse-vmemmap.c >>>> @@ -29,6 +29,7 @@ >>>> #include <linux/sched.h> >>>> #include <linux/pgtable.h> >>>> #include <linux/bootmem_info.h> >>>> +#include <linux/delay.h> >>>> >>>> #include <asm/dma.h> >>>> #include <asm/pgalloc.h> >>>> @@ -40,7 +41,8 @@ >>>> * @remap_pte: called for each non-empty PTE (lowest-level) entry. >>>> * @reuse_page: the page which is reused for the tail vmemmap pages. >>>> * @reuse_addr: the virtual address of the @reuse_page page. >>>> - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. >>>> + * @vmemmap_pages: the list head of the vmemmap pages that can be freed >>>> + * or is mapped from. >>>> */ >>>> struct vmemmap_remap_walk { >>>> void (*remap_pte)(pte_t *pte, unsigned long addr, >>>> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk { >>>> struct list_head *vmemmap_pages; >>>> }; >>>> >>>> +/* The gfp mask of allocating vmemmap page */ >>>> +#define GFP_VMEMMAP_PAGE \ >>>> + (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE) >>>> + >>> >>> This is unnecessary, just use the gfp mask directly in allocator. >> >> Will do. Thanks. >> >>> >>>> static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, >>>> unsigned long end, >>>> struct vmemmap_remap_walk *walk) >>>> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, >>>> free_vmemmap_page_list(&vmemmap_pages); >>>> } >>>> >>>> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, >>>> + struct vmemmap_remap_walk *walk) >>>> +{ >>>> + pgprot_t pgprot = PAGE_KERNEL; >>>> + struct page *page; >>>> + void *to; >>>> + >>>> + BUG_ON(pte_page(*pte) != walk->reuse_page); >>>> + >>>> + page = list_first_entry(walk->vmemmap_pages, struct page, lru); >>>> + list_del(&page->lru); >>>> + to = page_to_virt(page); >>>> + copy_page(to, (void *)walk->reuse_addr); >>>> + >>>> + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); >>>> +} >>>> + >>>> +static void alloc_vmemmap_page_list(struct list_head *list, >>>> + unsigned long start, unsigned long end) >>>> +{ >>>> + unsigned long addr; >>>> + >>>> + for (addr = start; addr < end; addr += PAGE_SIZE) { >>>> + struct page *page; >>>> + int nid = page_to_nid((const void *)addr); >>>> + >>>> +retry: >>>> + page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0); >>>> + if (unlikely(!page)) { >>>> + msleep(100); >>>> + /* >>>> + * We should retry infinitely, because we cannot >>>> + * handle allocation failures. Once we allocate >>>> + * vmemmap pages successfully, then we can free >>>> + * a HugeTLB page. >>>> + */ >>>> + goto retry; >>> >>> Ugh, I don't think this will work, there's no guarantee that we'll ever >>> succeed and now we can't free a 2MB hugepage because we cannot allocate a >>> 4KB page. We absolutely have to ensure we make forward progress here. >> >> This can trigger a OOM when there is no memory and kill someone to release >> some memory. Right? >> >>> >>> We're going to be freeing the hugetlb page after this succeeeds, can we >>> not use part of the hugetlb page that we're freeing for this memory >>> instead? >> >> It seems a good idea. 
We can try to allocate memory firstly, if successful, >> just use the new page to remap (it can reduce memory fragmentation). >> If not, we can use part of the hugetlb page to remap. What's your opinion >> about this? > > If the HugeTLB page is a gigantic page which is allocated from > CMA. In this case, we cannot use part of the hugetlb page to remap. > Right? Right; and I don't think the "reuse part of a huge page as vmemmap while freeing, while that part itself might not have a proper vmemmap yet (or might cover itself now)" is particularly straightforward. Maybe I'm wrong :) Also, watch out for huge pages on ZONE_MOVABLE, in that case you also shouldn't allocate the vmemmap from there ...
On Mon, Jan 25, 2021 at 5:15 PM David Hildenbrand <david@redhat.com> wrote: > > On 25.01.21 08:41, Muchun Song wrote: > > On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote: > >> > >> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote: > >>> > >>> > >>> On Sun, 17 Jan 2021, Muchun Song wrote: > >>> > >>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > >>>> index ce4be1fa93c2..3b146d5949f3 100644 > >>>> --- a/mm/sparse-vmemmap.c > >>>> +++ b/mm/sparse-vmemmap.c > >>>> @@ -29,6 +29,7 @@ > >>>> #include <linux/sched.h> > >>>> #include <linux/pgtable.h> > >>>> #include <linux/bootmem_info.h> > >>>> +#include <linux/delay.h> > >>>> > >>>> #include <asm/dma.h> > >>>> #include <asm/pgalloc.h> > >>>> @@ -40,7 +41,8 @@ > >>>> * @remap_pte: called for each non-empty PTE (lowest-level) entry. > >>>> * @reuse_page: the page which is reused for the tail vmemmap pages. > >>>> * @reuse_addr: the virtual address of the @reuse_page page. > >>>> - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. > >>>> + * @vmemmap_pages: the list head of the vmemmap pages that can be freed > >>>> + * or is mapped from. > >>>> */ > >>>> struct vmemmap_remap_walk { > >>>> void (*remap_pte)(pte_t *pte, unsigned long addr, > >>>> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk { > >>>> struct list_head *vmemmap_pages; > >>>> }; > >>>> > >>>> +/* The gfp mask of allocating vmemmap page */ > >>>> +#define GFP_VMEMMAP_PAGE \ > >>>> + (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE) > >>>> + > >>> > >>> This is unnecessary, just use the gfp mask directly in allocator. > >> > >> Will do. Thanks. > >> > >>> > >>>> static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, > >>>> unsigned long end, > >>>> struct vmemmap_remap_walk *walk) > >>>> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, > >>>> free_vmemmap_page_list(&vmemmap_pages); > >>>> } > >>>> > >>>> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, > >>>> + struct vmemmap_remap_walk *walk) > >>>> +{ > >>>> + pgprot_t pgprot = PAGE_KERNEL; > >>>> + struct page *page; > >>>> + void *to; > >>>> + > >>>> + BUG_ON(pte_page(*pte) != walk->reuse_page); > >>>> + > >>>> + page = list_first_entry(walk->vmemmap_pages, struct page, lru); > >>>> + list_del(&page->lru); > >>>> + to = page_to_virt(page); > >>>> + copy_page(to, (void *)walk->reuse_addr); > >>>> + > >>>> + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); > >>>> +} > >>>> + > >>>> +static void alloc_vmemmap_page_list(struct list_head *list, > >>>> + unsigned long start, unsigned long end) > >>>> +{ > >>>> + unsigned long addr; > >>>> + > >>>> + for (addr = start; addr < end; addr += PAGE_SIZE) { > >>>> + struct page *page; > >>>> + int nid = page_to_nid((const void *)addr); > >>>> + > >>>> +retry: > >>>> + page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0); > >>>> + if (unlikely(!page)) { > >>>> + msleep(100); > >>>> + /* > >>>> + * We should retry infinitely, because we cannot > >>>> + * handle allocation failures. Once we allocate > >>>> + * vmemmap pages successfully, then we can free > >>>> + * a HugeTLB page. > >>>> + */ > >>>> + goto retry; > >>> > >>> Ugh, I don't think this will work, there's no guarantee that we'll ever > >>> succeed and now we can't free a 2MB hugepage because we cannot allocate a > >>> 4KB page. We absolutely have to ensure we make forward progress here. 
> >> > >> This can trigger a OOM when there is no memory and kill someone to release > >> some memory. Right? > >> > >>> > >>> We're going to be freeing the hugetlb page after this succeeeds, can we > >>> not use part of the hugetlb page that we're freeing for this memory > >>> instead? > >> > >> It seems a good idea. We can try to allocate memory firstly, if successful, > >> just use the new page to remap (it can reduce memory fragmentation). > >> If not, we can use part of the hugetlb page to remap. What's your opinion > >> about this? > > > > If the HugeTLB page is a gigantic page which is allocated from > > CMA. In this case, we cannot use part of the hugetlb page to remap. > > Right? > > Right; and I don't think the "reuse part of a huge page as vmemmap while > freeing, while that part itself might not have a proper vmemmap yet (or > might cover itself now)" is particularly straight forward. Maybe I'm > wrong :) > > Also, watch out for huge pages on ZONE_MOVABLE, in that case you also > shouldn't allocate the vmemmap from there ... Yeah, you are right. So I lean toward triggering the OOM killer to kill other processes and reclaim some memory when the allocation fails. > > -- > Thanks, > > David / dhildenb >
On 1/25/21 1:34 AM, Muchun Song wrote: > On Mon, Jan 25, 2021 at 5:15 PM David Hildenbrand <david@redhat.com> wrote: >> >> On 25.01.21 08:41, Muchun Song wrote: >>> On Mon, Jan 25, 2021 at 2:40 PM Muchun Song <songmuchun@bytedance.com> wrote: >>>> >>>> On Mon, Jan 25, 2021 at 8:05 AM David Rientjes <rientjes@google.com> wrote: >>>>> >>>>> >>>>> On Sun, 17 Jan 2021, Muchun Song wrote: >>>>> >>>>>> diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c >>>>>> index ce4be1fa93c2..3b146d5949f3 100644 >>>>>> --- a/mm/sparse-vmemmap.c >>>>>> +++ b/mm/sparse-vmemmap.c >>>>>> @@ -29,6 +29,7 @@ >>>>>> #include <linux/sched.h> >>>>>> #include <linux/pgtable.h> >>>>>> #include <linux/bootmem_info.h> >>>>>> +#include <linux/delay.h> >>>>>> >>>>>> #include <asm/dma.h> >>>>>> #include <asm/pgalloc.h> >>>>>> @@ -40,7 +41,8 @@ >>>>>> * @remap_pte: called for each non-empty PTE (lowest-level) entry. >>>>>> * @reuse_page: the page which is reused for the tail vmemmap pages. >>>>>> * @reuse_addr: the virtual address of the @reuse_page page. >>>>>> - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. >>>>>> + * @vmemmap_pages: the list head of the vmemmap pages that can be freed >>>>>> + * or is mapped from. >>>>>> */ >>>>>> struct vmemmap_remap_walk { >>>>>> void (*remap_pte)(pte_t *pte, unsigned long addr, >>>>>> @@ -50,6 +52,10 @@ struct vmemmap_remap_walk { >>>>>> struct list_head *vmemmap_pages; >>>>>> }; >>>>>> >>>>>> +/* The gfp mask of allocating vmemmap page */ >>>>>> +#define GFP_VMEMMAP_PAGE \ >>>>>> + (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE) >>>>>> + >>>>> >>>>> This is unnecessary, just use the gfp mask directly in allocator. >>>> >>>> Will do. Thanks. >>>> >>>>> >>>>>> static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, >>>>>> unsigned long end, >>>>>> struct vmemmap_remap_walk *walk) >>>>>> @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, >>>>>> free_vmemmap_page_list(&vmemmap_pages); >>>>>> } >>>>>> >>>>>> +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, >>>>>> + struct vmemmap_remap_walk *walk) >>>>>> +{ >>>>>> + pgprot_t pgprot = PAGE_KERNEL; >>>>>> + struct page *page; >>>>>> + void *to; >>>>>> + >>>>>> + BUG_ON(pte_page(*pte) != walk->reuse_page); >>>>>> + >>>>>> + page = list_first_entry(walk->vmemmap_pages, struct page, lru); >>>>>> + list_del(&page->lru); >>>>>> + to = page_to_virt(page); >>>>>> + copy_page(to, (void *)walk->reuse_addr); >>>>>> + >>>>>> + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); >>>>>> +} >>>>>> + >>>>>> +static void alloc_vmemmap_page_list(struct list_head *list, >>>>>> + unsigned long start, unsigned long end) >>>>>> +{ >>>>>> + unsigned long addr; >>>>>> + >>>>>> + for (addr = start; addr < end; addr += PAGE_SIZE) { >>>>>> + struct page *page; >>>>>> + int nid = page_to_nid((const void *)addr); >>>>>> + >>>>>> +retry: >>>>>> + page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0); >>>>>> + if (unlikely(!page)) { >>>>>> + msleep(100); >>>>>> + /* >>>>>> + * We should retry infinitely, because we cannot >>>>>> + * handle allocation failures. Once we allocate >>>>>> + * vmemmap pages successfully, then we can free >>>>>> + * a HugeTLB page. >>>>>> + */ >>>>>> + goto retry; >>>>> >>>>> Ugh, I don't think this will work, there's no guarantee that we'll ever >>>>> succeed and now we can't free a 2MB hugepage because we cannot allocate a >>>>> 4KB page. We absolutely have to ensure we make forward progress here. 
>>>> >>>> This can trigger a OOM when there is no memory and kill someone to release >>>> some memory. Right? >>>> >>>>> >>>>> We're going to be freeing the hugetlb page after this succeeeds, can we >>>>> not use part of the hugetlb page that we're freeing for this memory >>>>> instead? >>>> >>>> It seems a good idea. We can try to allocate memory firstly, if successful, >>>> just use the new page to remap (it can reduce memory fragmentation). >>>> If not, we can use part of the hugetlb page to remap. What's your opinion >>>> about this? >>> >>> If the HugeTLB page is a gigantic page which is allocated from >>> CMA. In this case, we cannot use part of the hugetlb page to remap. >>> Right? >> >> Right; and I don't think the "reuse part of a huge page as vmemmap while >> freeing, while that part itself might not have a proper vmemmap yet (or >> might cover itself now)" is particularly straight forward. Maybe I'm >> wrong :) >> >> Also, watch out for huge pages on ZONE_MOVABLE, in that case you also >> shouldn't allocate the vmemmap from there ... > > Yeah, you are right. So I tend to trigger OOM to kill other processes to > reclaim some memory when we allocate memory fails. IIUC, even non-gigantic hugetlb pages can exist in CMA. They can be migrated out of CMA if needed (except free pages in the pool, but that is a separate issue David H already noted in another thread). When we first started discussing this patch set, one suggestion was to force hugetlb pool pages to be allocated at boot time and never permit them to be freed back to the buddy allocator. A primary reason for the suggestion was to avoid this issue of needing to allocate memory when freeing a hugetlb page to buddy. IMO, that would be an unreasonable restriction for many existing hugetlb use cases. A simple thought is that we simply fail the 'freeing hugetlb page to buddy' if we cannot allocate the required vmemmap pages. However, as David R says, freeing hugetlb pages to buddy is a reasonable way to free up memory in OOM situations. However, failing the operation 'might' be better than looping forever trying to allocate the pages needed? As mentioned in the previous patch, it would be better to use GFP_ATOMIC to at least dip into reserves if we can. I think using pages of the hugetlb for vmemmap to cover pages of the hugetlb is the only way we can guarantee success of freeing a hugetlb page to buddy. However, this should only be used when there is no other option and could result in vmemmap pages residing in CMA or ZONE_MOVABLE. I'm not sure how much better this is than failing the free to buddy operation. I don't have a solution. Just wanted to share some thoughts. BTW, just thought of something else. Consider offlining a memory section that contains a free hugetlb page. The offline code will try to dissolve the hugetlb page (free to buddy). So, vmemmap pages will need to be allocated. We will try to allocate vmemmap pages on the same node as the hugetlb page. But, if this memory section is the last of the node all the pages will have been isolated and no allocations will succeed. Is that a possible scenario, or am I just having too many negative thoughts?
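A minimal sketch of the "dip into reserves" fallback Mike mentions; the helper name and the exact gfp masks are assumptions:

```c
/*
 * Sketch only: try a normal allocation first, then fall back to
 * GFP_ATOMIC so the memory reserves can be tapped as a last resort.
 * The result may still be NULL, so the caller must handle failure.
 */
static struct page *alloc_vmemmap_page(int nid)
{
	struct page *page;

	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_RETRY_MAYFAIL |
				     __GFP_NOWARN | __GFP_THISNODE, 0);
	if (!page)
		page = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN, 0);

	return page;
}
```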
On Mon, Jan 25, 2021 at 03:25:35PM -0800, Mike Kravetz wrote: > IIUC, even non-gigantic hugetlb pages can exist in CMA. They can be migrated > out of CMA if needed (except free pages in the pool, but that is a separate > issue David H already noted in another thread). Yeah, as discussed I am taking a look at that. > When we first started discussing this patch set, one suggestion was to force > hugetlb pool pages to be allocated at boot time and never permit them to be > freed back to the buddy allocator. A primary reason for the suggestion was > to avoid this issue of needing to allocate memory when freeing a hugetlb page > to buddy. IMO, that would be an unreasonable restriction for many existing > hugetlb use cases. AFAIK it was suggested as a way to simplify things in the first go of this patchset. Please note that the first versions of this patchset was dealing with PMD mapped vmemmap pages and overall it was quite convulated for a first version. Since then, things had simplified quite a lot (e.g: we went from 22 patches to 12), so I do not feel the need to force the pages to be allocated at boot time. > A simple thought is that we simply fail the 'freeing hugetlb page to buddy' > if we can not allocate the required vmemmap pages. However, as David R says > freeing hugetlb pages to buddy is a reasonable way to free up memory in oom > situations. However, failing the operation 'might' be better than looping > forever trying to allocate the pages needed? As mentioned in the previous > patch, it would be better to use GFP_ATOMIC to at least dip into reserves if > we can. I also agree that GFP_ATOMIC might make some sense. If the system is under memory pressure, I think it is best if we go the extra mile in order to free up to 4096 pages or 512 pages. Otherwise we might have a nice hugetlb page we might not need and a lack of memory. > I think using pages of the hugetlb for vmemmap to cover pages of the hugetlb > is the only way we can guarantee success of freeing a hugetlb page to buddy. > However, this should only only be used when there is no other option and could > result in vmemmap pages residing in CMA or ZONE_MOVABLE. I'm not sure how > much better this is than failing the free to buddy operation. And how would you tell when there is no other option? > I don't have a solution. Just wanted to share some thoughts. > > BTW, just thought of something else. Consider offlining a memory section that > contains a free hugetlb page. The offline code will try to disolve the hugetlb > page (free to buddy). So, vmemmap pages will need to be allocated. We will > try to allocate vmemap pages on the same node as the hugetlb page. But, if > this memory section is the last of the node all the pages will have been > isolated and no allocations will succeed. Is that a possible scenario, or am > I just having too many negative thoughts? IIUC, GFP_ATOMIC will reset ALLOC_CPUSET flags at some point and the nodemask will be cleared, so I guess the system will try to allocate from another node. But I am not sure about that one. I would like to hear Michal's thoughts on this.
On Sun, Jan 17, 2021 at 11:10:46PM +0800, Muchun Song wrote: > When we free a HugeTLB page to the buddy allocator, we should allocate the > vmemmap pages associated with it. We can do that in the __free_hugepage() > before freeing it to buddy. > > Signed-off-by: Muchun Song <songmuchun@bytedance.com> This series has grown a certain grade of madurity and improvment, but it seems to me that we have been stuck in this patch (and patch#4) for quite some time. Would it be acceptable for a first implementation to not let hugetlb pages to be freed when this feature is in use? This would simplify things for now, as we could get rid of patch#4 and patch#5. We can always extend functionality once this has been merged, right? Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work when this in place, but well. I would like to hear what others think, but in my opinion it would be a big step to move on. > --- > include/linux/mm.h | 2 ++ > mm/hugetlb.c | 2 ++ > mm/hugetlb_vmemmap.c | 15 ++++++++++ > mm/hugetlb_vmemmap.h | 5 ++++ > mm/sparse-vmemmap.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++- > 5 files changed, 100 insertions(+), 1 deletion(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index f928994ed273..16b55d13b0ab 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -3007,6 +3007,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) > > void vmemmap_remap_free(unsigned long start, unsigned long end, > unsigned long reuse); > +void vmemmap_remap_alloc(unsigned long start, unsigned long end, > + unsigned long reuse); > > void *sparse_buffer_alloc(unsigned long size); > struct page * __populate_section_memmap(unsigned long pfn, > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index c165186ec2cf..d11c32fcdb38 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -1326,6 +1326,8 @@ static void update_hpage_vmemmap_workfn(struct work_struct *work) > page->mapping = NULL; > h = page_hstate(page); > > + alloc_huge_page_vmemmap(h, page); > + > spin_lock(&hugetlb_lock); > __free_hugepage(h, page); > spin_unlock(&hugetlb_lock); > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c > index 19f1898aaede..6108ae80314f 100644 > --- a/mm/hugetlb_vmemmap.c > +++ b/mm/hugetlb_vmemmap.c > @@ -183,6 +183,21 @@ static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) > return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; > } > > +void alloc_huge_page_vmemmap(struct hstate *h, struct page *head) > +{ > + unsigned long vmemmap_addr = (unsigned long)head; > + unsigned long vmemmap_end, vmemmap_reuse; > + > + if (!free_vmemmap_pages_per_hpage(h)) > + return; > + > + vmemmap_addr += RESERVE_VMEMMAP_SIZE; > + vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h); > + vmemmap_reuse = vmemmap_addr - PAGE_SIZE; > + > + vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse); > +} > + > void free_huge_page_vmemmap(struct hstate *h, struct page *head) > { > unsigned long vmemmap_addr = (unsigned long)head; > diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h > index 01f8637adbe0..b2c8d2f11d48 100644 > --- a/mm/hugetlb_vmemmap.h > +++ b/mm/hugetlb_vmemmap.h > @@ -11,6 +11,7 @@ > #include <linux/hugetlb.h> > > #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP > +void alloc_huge_page_vmemmap(struct hstate *h, struct page *head); > void free_huge_page_vmemmap(struct hstate *h, struct page *head); > > /* > @@ -25,6 +26,10 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h) 
> return 0; > } > #else > +static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head) > +{ > +} > + > static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head) > { > } > diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c > index ce4be1fa93c2..3b146d5949f3 100644 > --- a/mm/sparse-vmemmap.c > +++ b/mm/sparse-vmemmap.c > @@ -29,6 +29,7 @@ > #include <linux/sched.h> > #include <linux/pgtable.h> > #include <linux/bootmem_info.h> > +#include <linux/delay.h> > > #include <asm/dma.h> > #include <asm/pgalloc.h> > @@ -40,7 +41,8 @@ > * @remap_pte: called for each non-empty PTE (lowest-level) entry. > * @reuse_page: the page which is reused for the tail vmemmap pages. > * @reuse_addr: the virtual address of the @reuse_page page. > - * @vmemmap_pages: the list head of the vmemmap pages that can be freed. > + * @vmemmap_pages: the list head of the vmemmap pages that can be freed > + * or is mapped from. > */ > struct vmemmap_remap_walk { > void (*remap_pte)(pte_t *pte, unsigned long addr, > @@ -50,6 +52,10 @@ struct vmemmap_remap_walk { > struct list_head *vmemmap_pages; > }; > > +/* The gfp mask of allocating vmemmap page */ > +#define GFP_VMEMMAP_PAGE \ > + (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE) > + > static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, > unsigned long end, > struct vmemmap_remap_walk *walk) > @@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, > free_vmemmap_page_list(&vmemmap_pages); > } > > +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, > + struct vmemmap_remap_walk *walk) > +{ > + pgprot_t pgprot = PAGE_KERNEL; > + struct page *page; > + void *to; > + > + BUG_ON(pte_page(*pte) != walk->reuse_page); > + > + page = list_first_entry(walk->vmemmap_pages, struct page, lru); > + list_del(&page->lru); > + to = page_to_virt(page); > + copy_page(to, (void *)walk->reuse_addr); > + > + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); > +} > + > +static void alloc_vmemmap_page_list(struct list_head *list, > + unsigned long start, unsigned long end) > +{ > + unsigned long addr; > + > + for (addr = start; addr < end; addr += PAGE_SIZE) { > + struct page *page; > + int nid = page_to_nid((const void *)addr); > + > +retry: > + page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0); > + if (unlikely(!page)) { > + msleep(100); > + /* > + * We should retry infinitely, because we cannot > + * handle allocation failures. Once we allocate > + * vmemmap pages successfully, then we can free > + * a HugeTLB page. > + */ > + goto retry; > + } > + list_add_tail(&page->lru, list); > + } > +} > + > +/** > + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) > + * to the page which is from the @vmemmap_pages > + * respectively. > + * @start: start address of the vmemmap virtual address range. > + * @end: end address of the vmemmap virtual address range. > + * @reuse: reuse address. > + */ > +void vmemmap_remap_alloc(unsigned long start, unsigned long end, > + unsigned long reuse) > +{ > + LIST_HEAD(vmemmap_pages); > + struct vmemmap_remap_walk walk = { > + .remap_pte = vmemmap_restore_pte, > + .reuse_addr = reuse, > + .vmemmap_pages = &vmemmap_pages, > + }; > + > + might_sleep(); > + > + /* See the comment in the vmemmap_remap_free(). 
*/ > + BUG_ON(start - reuse != PAGE_SIZE); > + > + alloc_vmemmap_page_list(&vmemmap_pages, start, end); > + vmemmap_remap_range(reuse, end, &walk); > +} > + > /* > * Allocate a block of memory to be used to back the virtual memory map > * or to back the page tables that are used to create the mapping. > -- > 2.11.0 > >
On 26.01.21 10:29, Oscar Salvador wrote: > On Sun, Jan 17, 2021 at 11:10:46PM +0800, Muchun Song wrote: >> When we free a HugeTLB page to the buddy allocator, we should allocate the >> vmemmap pages associated with it. We can do that in the __free_hugepage() >> before freeing it to buddy. >> >> Signed-off-by: Muchun Song <songmuchun@bytedance.com> > > This series has grown a certain grade of madurity and improvment, but it seems > to me that we have been stuck in this patch (and patch#4) for quite some time. > > Would it be acceptable for a first implementation to not let hugetlb pages to > be freed when this feature is in use? > This would simplify things for now, as we could get rid of patch#4 and patch#5. > We can always extend functionality once this has been merged, right? I think either keep it completely simple (only free vmemmap of hugetlb pages allocated early during boot - which is what's not sufficient for some use cases) or implement the full thing properly (meaning, solve most challenging issues to get the basics running). I don't want to have some easy parts of complex features merged (e.g., breaking other stuff as you indicate below), and later finding out "it's not that easy" again and being stuck with it forever. > > Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work > when this in place, but well. Can you elaborate? Are we talking about having hugepages in ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Then a clear NACK from my side.
On Tue, Jan 26, 2021 at 10:36:21AM +0100, David Hildenbrand wrote: > I think either keep it completely simple (only free vmemmap of hugetlb > pages allocated early during boot - which is what's not sufficient for > some use cases) or implement the full thing properly (meaning, solve > most challenging issues to get the basics running). > > I don't want to have some easy parts of complex features merged (e.g., > breaking other stuff as you indicate below), and later finding out "it's > not that easy" again and being stuck with it forever. Well, we could try to do an optimistic allocation, without tricky looping. If that fails, refuse to shrink the pool at that moment. The user could always try to shrink it later via the /proc/sys/vm/nr_hugepages interface. But I am just thinking out loud... > > Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work > > when this in place, but well. > > Can you elaborate? Are we're talking about having hugepages in > ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Than > a clear NACK from my side. Pretty much, yeah.
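A sketch of the "optimistic allocation, refuse to shrink further" idea; the two pool helpers are hypothetical stand-ins for the pool-shrinking code in mm/hugetlb.c, and alloc_huge_page_vmemmap() is assumed to return an error code (in the posted patch it returns void):

```c
/*
 * Sketch only: stop shrinking the pool as soon as the vmemmap pages
 * for one huge page cannot be allocated; the user can retry later
 * via /proc/sys/vm/nr_hugepages.
 */
static void try_to_shrink_pool(struct hstate *h, unsigned long count)
{
	while (persistent_huge_pages(h) > count) {
		struct page *page = dequeue_free_huge_page(h); /* hypothetical */

		if (!page)
			break;

		if (alloc_huge_page_vmemmap(h, page)) {
			/* Put the page back and give up for now. */
			enqueue_huge_page(h, page);
			break;
		}
		update_and_free_page(h, page);
	}
}
```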
On 26.01.21 15:58, Oscar Salvador wrote: > On Tue, Jan 26, 2021 at 10:36:21AM +0100, David Hildenbrand wrote: >> I think either keep it completely simple (only free vmemmap of hugetlb >> pages allocated early during boot - which is what's not sufficient for >> some use cases) or implement the full thing properly (meaning, solve >> most challenging issues to get the basics running). >> >> I don't want to have some easy parts of complex features merged (e.g., >> breaking other stuff as you indicate below), and later finding out "it's >> not that easy" again and being stuck with it forever. > > Well, we could try to do an optimistic allocation, without tricky loopings. > If that fails, refuse to shrink the pool at that moment. > > The user could always try to shrink it later via /proc/sys/vm/nr_hugepages > interface. > > But I am just thinking out loud.. The real issue seems to be discarding the vmemmap on any memory that has movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we can reuse parts of the thingy we're freeing for the vmemmap. Not that it would be ideal: that once-a-huge-page thing will never ever be a huge page again - but if it helps with OOM in corner cases, sure. Possible simplification: don't perform the optimization for now with free huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? > >>> Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work >>> when this in place, but well. >> >> Can you elaborate? Are we're talking about having hugepages in >> ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Than >> a clear NACK from my side. > > Pretty much, yeah. Note that we most likely soon have to tackle migrating/dissolving (free) hugetlbfs pages from alloc_contig_range() context - e.g., for CMA allocations. That's certainly something to keep in mind regarding any approaches that already break offline_pages().
On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote: > The real issue seems to be discarding the vmemmap on any memory that has > movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we > can reuse parts of the thingy we're freeing for the vmemmap. Not that it > would be ideal: that once-a-huge-page thing will never ever be a huge page > again - but if it helps with OOM in corner cases, sure. Yes, that is one way, but I am not sure how hard it would be to implement. Plus the fact that as you pointed out, once that memory is used for the vmemmap array, we cannot use it again. Actually, we would fragment the memory eventually? > Possible simplification: don't perform the optimization for now with free > huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what > happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? But if we do not allow those pages to be in ZONE_MOVABLE or CMA, there is no point in migrating them, right? > > > > Of course, this means that e.g: memory-hotplug (hot-remove) will not fully work > > > > when this in place, but well. > > > > > > Can you elaborate? Are we're talking about having hugepages in > > > ZONE_MOVABLE that are not migratable (and/or dissolvable) anymore? Than > > > a clear NACK from my side. > > > > Pretty much, yeah. > > Note that we most likely soon have to tackle migrating/dissolving (free) > hugetlbfs pages from alloc_contig_range() context - e.g., for CMA > allocations. That's certainly something to keep in mind regarding any > approaches that already break offline_pages(). Definitely. I already talked to Mike about that and I am going to have a look into it pretty soon.
On 26.01.21 16:34, Oscar Salvador wrote: > On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote: >> The real issue seems to be discarding the vmemmap on any memory that has >> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we >> can reuse parts of the thingy we're freeing for the vmemmap. Not that it >> would be ideal: that once-a-huge-page thing will never ever be a huge page >> again - but if it helps with OOM in corner cases, sure. > > Yes, that is one way, but I am not sure how hard would it be to implement. > Plus the fact that as you pointed out, once that memory is used for vmemmap > array, we cannot use it again. > Actually, we would fragment the memory eventually? > >> Possible simplification: don't perform the optimization for now with free >> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what >> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? > > But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no > point in migrate them, right? Well, memory unplug "could" still work and migrate them and alloc_contig_range() "could in the future" still want to migrate them (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair enough to say "there are no guarantees for alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break these use cases when a magic switch is flipped and make these pages non-migratable anymore". I assume compaction doesn't care about huge pages either way, not sure about numa balancing etc. However, note that there is a fundamental issue with any approach that allocates a significant amount of unmovable memory for user-space purposes (excluding CMA allocations for unmovable stuff, CMA is special): pairing it with ZONE_MOVABLE becomes very tricky as your user space might just end up eating all kernel memory, although the system still looks like there is plenty of free memory residing in ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced form as well. We theoretically have that issue with dynamic allocation of gigantic pages, but it's something a user explicitly/rarely triggers and it can be documented to cause problems well enough. We'll have the same issue with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is already known to be broken in various ways and that it has to be treated in a special way. I'd like to limit the nasty corner cases. Of course, we could have smart rules like "don't online memory to ZONE_MOVABLE automatically when the magic switch is active". That's just ugly, but could work.
On 26.01.21 16:56, David Hildenbrand wrote: > On 26.01.21 16:34, Oscar Salvador wrote: >> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote: >>> The real issue seems to be discarding the vmemmap on any memory that has >>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we >>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it >>> would be ideal: that once-a-huge-page thing will never ever be a huge page >>> again - but if it helps with OOM in corner cases, sure. >> >> Yes, that is one way, but I am not sure how hard would it be to implement. >> Plus the fact that as you pointed out, once that memory is used for vmemmap >> array, we cannot use it again. >> Actually, we would fragment the memory eventually? >> >>> Possible simplification: don't perform the optimization for now with free >>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what >>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? >> >> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no >> point in migrate them, right? > > Well, memory unplug "could" still work and migrate them and > alloc_contig_range() "could in the future" still want to migrate them > (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter > two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair > enough to say "there are no guarantees for > alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break > these use cases when a magic switch is flipped and make these pages > non-migratable anymore". > > I assume compaction doesn't care about huge pages either way, not sure > about numa balancing etc. > > > However, note that there is a fundamental issue with any approach that > allocates a significant amount of unmovable memory for user-space > purposes (excluding CMA allocations for unmovable stuff, CMA is > special): pairing it with ZONE_MOVABLE becomes very tricky as your user > space might just end up eating all kernel memory, although the system > still looks like there is plenty of free memory residing in > ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced > form as well. > > We theoretically have that issue with dynamic allocation of gigantic > pages, but it's something a user explicitly/rarely triggers and it can > be documented to cause problems well enough. We'll have the same issue > with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is > already known to be broken in various ways and that it has to be treated > in a special way. I'd like to limit the nasty corner cases. > > Of course, we could have smart rules like "don't online memory to > ZONE_MOVABLE automatically when the magic switch is active". That's just > ugly, but could work. > Extending on that, I just discovered that only x86-64, ppc64, and arm64 really support hugepage migration. Maybe one approach with the "magic switch" really would be to disable hugepage migration completely in hugepage_migration_supported(), and consequently making hugepage_movable_supported() always return false. Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be migrated. The problem I describe would apply (careful with using ZONE_MOVABLE), but well, it can at least be documented.
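A sketch of the "magic switch" being described, assuming a global predicate for the feature (hugetlb_vmemmap_optimization_enabled() is a hypothetical name); since hugepage_movable_supported() already returns false whenever hugepage_migration_supported() does, gating the latter is enough to keep such pages out of ZONE_MOVABLE:

```c
/*
 * Sketch only: report hugepage migration as unsupported whenever the
 * vmemmap optimization is active, so these pages are never placed in
 * ZONE_MOVABLE/CMA and are never expected to be migratable.
 */
static inline bool hugepage_migration_supported(struct hstate *h)
{
	if (hugetlb_vmemmap_optimization_enabled())	/* assumed switch */
		return false;

	return arch_hugetlb_migration_supported(h);
}
```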
On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote: > > On 26.01.21 16:56, David Hildenbrand wrote: > > On 26.01.21 16:34, Oscar Salvador wrote: > >> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote: > >>> The real issue seems to be discarding the vmemmap on any memory that has > >>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we > >>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it > >>> would be ideal: that once-a-huge-page thing will never ever be a huge page > >>> again - but if it helps with OOM in corner cases, sure. > >> > >> Yes, that is one way, but I am not sure how hard would it be to implement. > >> Plus the fact that as you pointed out, once that memory is used for vmemmap > >> array, we cannot use it again. > >> Actually, we would fragment the memory eventually? > >> > >>> Possible simplification: don't perform the optimization for now with free > >>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what > >>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? > >> > >> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no > >> point in migrate them, right? > > > > Well, memory unplug "could" still work and migrate them and > > alloc_contig_range() "could in the future" still want to migrate them > > (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter > > two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair > > enough to say "there are no guarantees for > > alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break > > these use cases when a magic switch is flipped and make these pages > > non-migratable anymore". > > > > I assume compaction doesn't care about huge pages either way, not sure > > about numa balancing etc. > > > > > > However, note that there is a fundamental issue with any approach that > > allocates a significant amount of unmovable memory for user-space > > purposes (excluding CMA allocations for unmovable stuff, CMA is > > special): pairing it with ZONE_MOVABLE becomes very tricky as your user > > space might just end up eating all kernel memory, although the system > > still looks like there is plenty of free memory residing in > > ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced > > form as well. > > > > We theoretically have that issue with dynamic allocation of gigantic > > pages, but it's something a user explicitly/rarely triggers and it can > > be documented to cause problems well enough. We'll have the same issue > > with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is > > already known to be broken in various ways and that it has to be treated > > in a special way. I'd like to limit the nasty corner cases. > > > > Of course, we could have smart rules like "don't online memory to > > ZONE_MOVABLE automatically when the magic switch is active". That's just > > ugly, but could work. > > > > Extending on that, I just discovered that only x86-64, ppc64, and arm64 > really support hugepage migration. > > Maybe one approach with the "magic switch" really would be to disable > hugepage migration completely in hugepage_migration_supported(), and > consequently making hugepage_movable_supported() always return false. > > Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be > migrated. The problem I describe would apply (careful with using > ZONE_MOVABLE), but well, it can at least be documented. 
Thanks for your explanation. All of this thinking seems to be driven by the possibility of hitting OOM. :-( In order to move forward and free the hugepage, we should add some restrictions below. 1. Only free the hugepage which is allocated from the ZONE_NORMAL. 2. Disable hugepage migration when this feature is enabled. 3. Using GFP_ATOMIC to allocate vmemmap pages firstly (it can reduce memory fragmentation), if it fails, we use part of the hugepage to remap. Hi Oscar, Mike and David H, what's your opinion about this? Should we take this approach? Thanks. > > -- > Thanks, > > David / dhildenb >
On Thu, Jan 28, 2021 at 8:37 PM Muchun Song <songmuchun@bytedance.com> wrote: > > On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote: > > > > On 26.01.21 16:56, David Hildenbrand wrote: > > > On 26.01.21 16:34, Oscar Salvador wrote: > > >> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote: > > >>> The real issue seems to be discarding the vmemmap on any memory that has > > >>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we > > >>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it > > >>> would be ideal: that once-a-huge-page thing will never ever be a huge page > > >>> again - but if it helps with OOM in corner cases, sure. > > >> > > >> Yes, that is one way, but I am not sure how hard would it be to implement. > > >> Plus the fact that as you pointed out, once that memory is used for vmemmap > > >> array, we cannot use it again. > > >> Actually, we would fragment the memory eventually? > > >> > > >>> Possible simplification: don't perform the optimization for now with free > > >>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what > > >>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? > > >> > > >> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no > > >> point in migrate them, right? > > > > > > Well, memory unplug "could" still work and migrate them and > > > alloc_contig_range() "could in the future" still want to migrate them > > > (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter > > > two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair > > > enough to say "there are no guarantees for > > > alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break > > > these use cases when a magic switch is flipped and make these pages > > > non-migratable anymore". > > > > > > I assume compaction doesn't care about huge pages either way, not sure > > > about numa balancing etc. > > > > > > > > > However, note that there is a fundamental issue with any approach that > > > allocates a significant amount of unmovable memory for user-space > > > purposes (excluding CMA allocations for unmovable stuff, CMA is > > > special): pairing it with ZONE_MOVABLE becomes very tricky as your user > > > space might just end up eating all kernel memory, although the system > > > still looks like there is plenty of free memory residing in > > > ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced > > > form as well. > > > > > > We theoretically have that issue with dynamic allocation of gigantic > > > pages, but it's something a user explicitly/rarely triggers and it can > > > be documented to cause problems well enough. We'll have the same issue > > > with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is > > > already known to be broken in various ways and that it has to be treated > > > in a special way. I'd like to limit the nasty corner cases. > > > > > > Of course, we could have smart rules like "don't online memory to > > > ZONE_MOVABLE automatically when the magic switch is active". That's just > > > ugly, but could work. > > > > > > > Extending on that, I just discovered that only x86-64, ppc64, and arm64 > > really support hugepage migration. > > > > Maybe one approach with the "magic switch" really would be to disable > > hugepage migration completely in hugepage_migration_supported(), and > > consequently making hugepage_movable_supported() always return false. 
> > > > Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be > > migrated. The problem I describe would apply (careful with using > > ZONE_MOVABLE), but well, it can at least be documented. > > Thanks for your explanation. > > All thinking seems to be introduced by encountering OOM. :-( > > In order to move forward and free the hugepage. We should add some > restrictions below. > > 1. Only free the hugepage which is allocated from the ZONE_NORMAL. ^^ Sorry. Here "free" should be "optimize". > 2. Disable hugepage migration when this feature is enabled. > 3. Using GFP_ATOMIC to allocate vmemmap pages firstly (it can reduce > memory fragmentation), if it fails, we use part of the hugepage to > remap. > > Hi Oscar, Mike and David H > > What's your opinion about this? Should we take this approach? > > Thanks. > > > > > -- > > Thanks, > > > > David / dhildenb > >
On Wed, Jan 27, 2021 at 11:36:15AM +0100, David Hildenbrand wrote:
> Extending on that, I just discovered that only x86-64, ppc64, and arm64
> really support hugepage migration.
>
> Maybe one approach with the "magic switch" really would be to disable
> hugepage migration completely in hugepage_migration_supported(), and
> consequently making hugepage_movable_supported() always return false.

Ok, so migration would not work for these pages, and since they would
lie in !ZONE_MOVABLE there is no guarantee we can unplug the memory.
Well, we really cannot unplug it unless the hugepage is not in use
(it can be dissolved at least).

Now to the allocation-when-freeing.
The current implementation uses (or wants to use) GFP_ATOMIC plus a forever loop.
One of the problems I see with GFP_ATOMIC is that it gives you access
to memory reserves, but there are more users of those reserves.
Then, in the worst case we need to allocate 16MB of order-0 pages
to free up a 1GB hugepage, so the question is whether the reserves
really scale to 16MB plus the other users accessing them.

As I said, if anything I would go for an optimistic allocation attempt;
if we fail, just refuse to shrink the pool.
The user can always try to shrink it later again via the /sys interface.

Since hugepages would no longer be in ZONE_MOVABLE/CMA and are not
expected to be migratable, is that ok?

Using the hugepage for the vmemmap array was brought up several times,
but that would imply fragmenting memory over time.

All in all it seems to be overly complicated (I might be wrong).

> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be
> migrated. The problem I describe would apply (careful with using
> ZONE_MOVABLE), but well, it can at least be documented.

I am not a page allocator expert, but cannot the allocation fall back
to ZONE_MOVABLE under memory shortage on other zones?
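To make the "optimistic allocation-try, refuse to shrink" idea concrete, here is a rough caller-side sketch. It assumes alloc_huge_page_vmemmap() is changed to return an errno instead of looping forever, and uses a hypothetical putback_free_huge_page() helper; neither is code from this series.

/*
 * Sketch only: free one HugeTLB page back to buddy, but give up instead
 * of retrying forever if the vmemmap allocation fails.
 */
static int try_free_huge_page(struct hstate *h, struct page *page)
{
	if (alloc_huge_page_vmemmap(h, page)) {
		/* Refuse to shrink the pool; the user can retry via /sys. */
		spin_lock(&hugetlb_lock);
		putback_free_huge_page(h, page);	/* hypothetical helper */
		spin_unlock(&hugetlb_lock);
		return -ENOMEM;
	}

	spin_lock(&hugetlb_lock);
	__free_hugepage(h, page);
	spin_unlock(&hugetlb_lock);
	return 0;
}

The only user-visible change is that shrinking the pool can stop early; the pages stay usable as huge pages.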
On 1/28/21 4:37 AM, Muchun Song wrote: > On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote: >> >> On 26.01.21 16:56, David Hildenbrand wrote: >>> On 26.01.21 16:34, Oscar Salvador wrote: >>>> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote: >>>>> The real issue seems to be discarding the vmemmap on any memory that has >>>>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we >>>>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it >>>>> would be ideal: that once-a-huge-page thing will never ever be a huge page >>>>> again - but if it helps with OOM in corner cases, sure. >>>> >>>> Yes, that is one way, but I am not sure how hard would it be to implement. >>>> Plus the fact that as you pointed out, once that memory is used for vmemmap >>>> array, we cannot use it again. >>>> Actually, we would fragment the memory eventually? >>>> >>>>> Possible simplification: don't perform the optimization for now with free >>>>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what >>>>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? >>>> >>>> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no >>>> point in migrate them, right? >>> >>> Well, memory unplug "could" still work and migrate them and >>> alloc_contig_range() "could in the future" still want to migrate them >>> (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter >>> two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair >>> enough to say "there are no guarantees for >>> alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break >>> these use cases when a magic switch is flipped and make these pages >>> non-migratable anymore". >>> >>> I assume compaction doesn't care about huge pages either way, not sure >>> about numa balancing etc. >>> >>> >>> However, note that there is a fundamental issue with any approach that >>> allocates a significant amount of unmovable memory for user-space >>> purposes (excluding CMA allocations for unmovable stuff, CMA is >>> special): pairing it with ZONE_MOVABLE becomes very tricky as your user >>> space might just end up eating all kernel memory, although the system >>> still looks like there is plenty of free memory residing in >>> ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced >>> form as well. >>> >>> We theoretically have that issue with dynamic allocation of gigantic >>> pages, but it's something a user explicitly/rarely triggers and it can >>> be documented to cause problems well enough. We'll have the same issue >>> with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is >>> already known to be broken in various ways and that it has to be treated >>> in a special way. I'd like to limit the nasty corner cases. >>> >>> Of course, we could have smart rules like "don't online memory to >>> ZONE_MOVABLE automatically when the magic switch is active". That's just >>> ugly, but could work. >>> >> >> Extending on that, I just discovered that only x86-64, ppc64, and arm64 >> really support hugepage migration. >> >> Maybe one approach with the "magic switch" really would be to disable >> hugepage migration completely in hugepage_migration_supported(), and >> consequently making hugepage_movable_supported() always return false. >> >> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be >> migrated. 
The problem I describe would apply (careful with using
>> ZONE_MOVABLE), but well, it can at least be documented.
>
> Thanks for your explanation.
>
> All thinking seems to be introduced by encountering OOM. :-(

Yes. Or, I think about it as the problem of not being able to dissolve (free
to buddy) a hugetlb page. We can not dissolve because we can not allocate
the vmemmap for all the subpages.

> In order to move forward and free the hugepage. We should add some
> restrictions below.
>
> 1. Only free the hugepage which is allocated from the ZONE_NORMAL.

Corrected: Only vmemmap optimize hugepages in ZONE_NORMAL.

> 2. Disable hugepage migration when this feature is enabled.

I am not sure if we want to fully disable migration. I may be misunderstanding,
but the thought was to prevent migration between some movability types. It
seems we should be able to migrate from ZONE_NORMAL to ZONE_NORMAL.

Also, if we do allow huge pages without vmemmap optimization in MOVABLE or CMA,
then we should allow those to be migrated to NORMAL? Or is there a reason why
we should prevent that?

> 3. Using GFP_ATOMIC to allocate vmemmap pages firstly (it can reduce
> memory fragmentation), if it fails, we use part of the hugepage to
> remap.

I honestly am not sure about this. This would only happen for pages in
NORMAL. The only time using part of the huge page for vmemmap would help is
if we are trying to dissolve huge pages to free up memory for other uses.

> What's your opinion about this? Should we take this approach?

I think trying to solve all the issues that could happen as the result of
not being able to dissolve a hugetlb page has made this extremely complex.
I know this is something we need to address/solve. We do not want to add
more unexpected behavior in corner cases. However, I can not help but think
about similar issues today. For example, if a huge page is in use in
ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today.
Correct? We may need to allocate another huge page for the target of the
migration, and there is no guarantee we can do that.
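The "only vmemmap optimize hugepages in ZONE_NORMAL" restriction could be expressed as a gate in the optimization path. A minimal sketch, assuming a hypothetical vmemmap_optimize_allowed() helper; this is not code from the series, and the CMA check only matters when CONFIG_CMA is enabled.

/* Sketch: skip the optimization for pages whose movability matters. */
static bool vmemmap_optimize_allowed(struct page *head)
{
	if (page_zonenum(head) == ZONE_MOVABLE)
		return false;
#ifdef CONFIG_CMA
	if (get_pageblock_migratetype(head) == MIGRATE_CMA)
		return false;
#endif
	return true;
}

void free_huge_page_vmemmap(struct hstate *h, struct page *head)
{
	if (!free_vmemmap_pages_per_hpage(h) || !vmemmap_optimize_allowed(head))
		return;

	/* ... remap and free the tail vmemmap pages as in the patch ... */
}

Pages that keep their full vmemmap would then stay fully migratable, which is the property Mike is asking about.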
On Fri, Jan 29, 2021 at 6:29 AM Oscar Salvador <osalvador@suse.de> wrote: > > On Wed, Jan 27, 2021 at 11:36:15AM +0100, David Hildenbrand wrote: > > Extending on that, I just discovered that only x86-64, ppc64, and arm64 > > really support hugepage migration. > > > > Maybe one approach with the "magic switch" really would be to disable > > hugepage migration completely in hugepage_migration_supported(), and > > consequently making hugepage_movable_supported() always return false. > > Ok, so migration would not fork for these pages, and since them would > lay in !ZONE_MOVABLE there is no guarantee we can unplug the memory. > Well, we really cannot unplug it unless the hugepage is not used > (it can be dissolved at least). > > Now to the allocation-when-freeing. > Current implementation uses GFP_ATOMIC(or wants to use) + forever loop. > One of the problems I see with GFP_ATOMIC is that gives you access > to memory reserves, but there are more users using those reserves. > Then, worst-scenario case we need to allocate 16MB order-0 pages > to free up 1GB hugepage, so the question would be whether reserves > really scale to 16MB + more users accessing reserves. > > As I said, if anything I would go for an optimistic allocation-try > , if we fail just refuse to shrink the pool. > User can always try to shrink it later again via /sys interface. Yeah. It seems that this is the easy way to move on. Thanks. > > Since hugepages would not be longer in ZONE_MOVABLE/CMA and are not > expected to be migratable, is that ok? > > Using the hugepage for the vmemmap array was brought up several times, > but that would imply fragmenting memory over time. > > All in all seems to be overly complicated (I might be wrong). > > > > Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be > > migrated. The problem I describe would apply (careful with using > > ZONE_MOVABLE), but well, it can at least be documented. > > I am not a page allocator expert but cannot the allocation fallback > to ZONE_MOVABLE under memory shortage on other zones? > > > -- > Oscar Salvador > SUSE L3
On Fri, Jan 29, 2021 at 9:04 AM Mike Kravetz <mike.kravetz@oracle.com> wrote: > > On 1/28/21 4:37 AM, Muchun Song wrote: > > On Wed, Jan 27, 2021 at 6:36 PM David Hildenbrand <david@redhat.com> wrote: > >> > >> On 26.01.21 16:56, David Hildenbrand wrote: > >>> On 26.01.21 16:34, Oscar Salvador wrote: > >>>> On Tue, Jan 26, 2021 at 04:10:53PM +0100, David Hildenbrand wrote: > >>>>> The real issue seems to be discarding the vmemmap on any memory that has > >>>>> movability constraints - CMA and ZONE_MOVABLE; otherwise, as discussed, we > >>>>> can reuse parts of the thingy we're freeing for the vmemmap. Not that it > >>>>> would be ideal: that once-a-huge-page thing will never ever be a huge page > >>>>> again - but if it helps with OOM in corner cases, sure. > >>>> > >>>> Yes, that is one way, but I am not sure how hard would it be to implement. > >>>> Plus the fact that as you pointed out, once that memory is used for vmemmap > >>>> array, we cannot use it again. > >>>> Actually, we would fragment the memory eventually? > >>>> > >>>>> Possible simplification: don't perform the optimization for now with free > >>>>> huge pages residing on ZONE_MOVABLE or CMA. Certainly not perfect: what > >>>>> happens when migrating a huge page from ZONE_NORMAL to (ZONE_MOVABLE|CMA)? > >>>> > >>>> But if we do not allow theose pages to be in ZONE_MOVABLE or CMA, there is no > >>>> point in migrate them, right? > >>> > >>> Well, memory unplug "could" still work and migrate them and > >>> alloc_contig_range() "could in the future" still want to migrate them > >>> (virtio-mem, gigantic pages, powernv memtrace). Especially, the latter > >>> two don't work with ZONE_MOVABLE/CMA. But, I mean, it would be fair > >>> enough to say "there are no guarantees for > >>> alloc_contig_range()/offline_pages() with ZONE_NORMAL, so we can break > >>> these use cases when a magic switch is flipped and make these pages > >>> non-migratable anymore". > >>> > >>> I assume compaction doesn't care about huge pages either way, not sure > >>> about numa balancing etc. > >>> > >>> > >>> However, note that there is a fundamental issue with any approach that > >>> allocates a significant amount of unmovable memory for user-space > >>> purposes (excluding CMA allocations for unmovable stuff, CMA is > >>> special): pairing it with ZONE_MOVABLE becomes very tricky as your user > >>> space might just end up eating all kernel memory, although the system > >>> still looks like there is plenty of free memory residing in > >>> ZONE_MOVABLE. I mentioned that in the context of secretmem in a reduced > >>> form as well. > >>> > >>> We theoretically have that issue with dynamic allocation of gigantic > >>> pages, but it's something a user explicitly/rarely triggers and it can > >>> be documented to cause problems well enough. We'll have the same issue > >>> with GUP+ZONE_MOVABLE that Pavel is fixing right now - but GUP is > >>> already known to be broken in various ways and that it has to be treated > >>> in a special way. I'd like to limit the nasty corner cases. > >>> > >>> Of course, we could have smart rules like "don't online memory to > >>> ZONE_MOVABLE automatically when the magic switch is active". That's just > >>> ugly, but could work. > >>> > >> > >> Extending on that, I just discovered that only x86-64, ppc64, and arm64 > >> really support hugepage migration. 
> >> > >> Maybe one approach with the "magic switch" really would be to disable > >> hugepage migration completely in hugepage_migration_supported(), and > >> consequently making hugepage_movable_supported() always return false. > >> > >> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be > >> migrated. The problem I describe would apply (careful with using > >> ZONE_MOVABLE), but well, it can at least be documented. > > > > Thanks for your explanation. > > > > All thinking seems to be introduced by encountering OOM. :-( > > Yes. Or, I think about it as the problem of not being able to dissolve (free > to buddy) a hugetlb page. We can not dissolve because we can not allocate > vmemmap for all sumpages. > > > In order to move forward and free the hugepage. We should add some > > restrictions below. > > > > 1. Only free the hugepage which is allocated from the ZONE_NORMAL. > Corrected: Only vmemmap optimize hugepages in ZONE_NORMAL > > > 2. Disable hugepage migration when this feature is enabled. > > I am not sure if we want to fully disable migration. I may be misunderstanding > but the thought was to prevent migration between some movability types. It > seems we should be able to migrate form ZONE_NORMAL to ZONE_NORMAL. > > Also, if we do allow huge pages without vmemmap optimization in MOVABLE or CMA > then we should allow those to be migrated to NORMAL? Or is there a reason why > we should prevent that. > > > 3. Using GFP_ATOMIC to allocate vmemmap pages firstly (it can reduce > > memory fragmentation), if it fails, we use part of the hugepage to > > remap. > > I honestly am not sure about this. This would only happen for pages in > NORMAL. The only time using part of the huge page for vmemmap would help is > if we are trying to dissolve huge pages to free up memory for other uses. > > > What's your opinion about this? Should we take this approach? > > I think trying to solve all the issues that could happen as the result of > not being able to dissolve a hugetlb page has made this extremely complex. > I know this is something we need to address/solve. We do not want to add > more unexpected behavior in corner cases. However, I can not help but think > about similar issues today. For example, if a huge page is in use in > ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today. > Correct? We may need to allocate another huge page for the target of the > migration, and there is no guarantee we can do that. Yeah. Adding more restrictions makes things more complex. As you and Oscar said, refusing to free hugepage when allocating vmemmap pages fail may be an easy way now. > -- > Mike Kravetz
On 28.01.21 23:29, Oscar Salvador wrote: > On Wed, Jan 27, 2021 at 11:36:15AM +0100, David Hildenbrand wrote: >> Extending on that, I just discovered that only x86-64, ppc64, and arm64 >> really support hugepage migration. >> >> Maybe one approach with the "magic switch" really would be to disable >> hugepage migration completely in hugepage_migration_supported(), and >> consequently making hugepage_movable_supported() always return false. > > Ok, so migration would not fork for these pages, and since them would > lay in !ZONE_MOVABLE there is no guarantee we can unplug the memory. > Well, we really cannot unplug it unless the hugepage is not used > (it can be dissolved at least). > > Now to the allocation-when-freeing. > Current implementation uses GFP_ATOMIC(or wants to use) + forever loop. > One of the problems I see with GFP_ATOMIC is that gives you access > to memory reserves, but there are more users using those reserves. > Then, worst-scenario case we need to allocate 16MB order-0 pages > to free up 1GB hugepage, so the question would be whether reserves > really scale to 16MB + more users accessing reserves. > > As I said, if anything I would go for an optimistic allocation-try > , if we fail just refuse to shrink the pool. > User can always try to shrink it later again via /sys interface. > > Since hugepages would not be longer in ZONE_MOVABLE/CMA and are not > expected to be migratable, is that ok? > > Using the hugepage for the vmemmap array was brought up several times, > but that would imply fragmenting memory over time. > > All in all seems to be overly complicated (I might be wrong). > > >> Huge pages would never get placed onto ZONE_MOVABLE/CMA and cannot be >> migrated. The problem I describe would apply (careful with using >> ZONE_MOVABLE), but well, it can at least be documented. > > I am not a page allocator expert but cannot the allocation fallback > to ZONE_MOVABLE under memory shortage on other zones? No, for now it's not done. Only movable allocations target ZONE_MOVABLE. Doing so would be controversial: when would be the right point in time to start spilling unmovable allocations into CMA/ZONE_MOVABLE? You certainly want to try other things first (swapping, reclaim, compaction), before breaking any guarantees regarding hotunplug+migration/compaction you have with CMA/ZONE_MOVABLE. And even if you would allow it, your workload would already suffer extremely. So it smells more like a setup issue. But then, who knows when allocating huge pages (esp. at runtime) that there are such side effects before actually running into them? We can make sure that all relevant archs support migration of ordinary (!gigantic) huge pages (for now, only x86-64, ppc64/spapr, arm64), so we can place them onto ZONE_MOVABLE. It gets harder with more special cases. Gigantic pages (without CMA) are more of a general issue, but at least it's simple to document ("Careful when pairing ZONE_MOVABLE with gigantic pages on !CMA"). An unexpected high amount of unmovable memory is just extremely difficult to handle with ZONE_MOVABLE; it's hard for the user/admin to figure out that such restrictions actually apply.
>> What's your opinion about this? Should we take this approach?
>
> I think trying to solve all the issues that could happen as the result of
> not being able to dissolve a hugetlb page has made this extremely complex.
> I know this is something we need to address/solve. We do not want to add
> more unexpected behavior in corner cases. However, I can not help but think
> about similar issues today. For example, if a huge page is in use in
> ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today.

Yes, hugetlbfs is broken with alloc_contig_range() as e.g., used by CMA and
needs fixing. Then, similar problems as with hugetlbfs pages on ZONE_MOVABLE
apply.

hugetlbfs pages on ZONE_MOVABLE for memory unplug are problematic in corner
cases only I think:

1. Not sufficient memory to allocate a destination page. Well, nothing we
   can really do about that - just like trying to migrate any other memory
   but running into -ENOMEM.

2. Trying to dissolve a free huge page but running into reservation limits.
   I think we should at least try allocating a new free huge page before
   failing. To be tackled in the future.

> Correct? We may need to allocate another huge page for the target of the
> migration, and there is no guarantee we can do that.
>

I agree that 1. is similar to "cannot migrate because OOM".

So thinking about it again, we don't actually seem to lose that much when

a) Rejecting migration of a huge page when not being able to allocate the
   vmemmap for our source page. Our system seems to be under quite some
   memory pressure already. Migration could just fail because we fail to
   allocate a migration target already.

b) Rejecting to dissolve a huge page when not able to allocate the vmemmap.
   Dissolving can fail already. And, again, our system seems to be under
   quite some memory pressure already.

c) Rejecting freeing huge pages when not able to allocate the vmemmap. I
   guess the "only" surprise is that the user might now no longer get what
   he asked for. This seems to be the "real change".

So maybe little actually speaks against allowing for migration of such huge
pages and optimizing any huge page, besides rejecting freeing of huge pages
and surprising the user/admin.

I guess while our system is under memory pressure CMA and ZONE_MOVABLE are
already no longer able to always keep their guarantees - until there is no
more memory pressure.
On 2/1/21 8:10 AM, David Hildenbrand wrote:
>>> What's your opinion about this? Should we take this approach?
>>
>> I think trying to solve all the issues that could happen as the result of
>> not being able to dissolve a hugetlb page has made this extremely complex.
>> I know this is something we need to address/solve. We do not want to add
>> more unexpected behavior in corner cases. However, I can not help but think
>> about similar issues today. For example, if a huge page is in use in
>> ZONE_MOVABLE or CMA there is no guarantee that it can be migrated today.
>
> Yes, hugetlbfs is broken with alloc_contig_range() as e.g., used by CMA and needs fixing. Then, similar problems as with hugetlbfs pages on ZONE_MOVABLE apply.
>
> hugetlbfs pages on ZONE_MOVABLE for memory unplug are problematic in corner cases only I think:
>
> 1. Not sufficient memory to allocate a destination page. Well, nothing we can really do about that - just like trying to migrate any other memory but running into -ENOMEM.
>
> 2. Trying to dissolve a free huge page but running into reservation limits. I think we should at least try allocating a new free huge page before failing. To be tackled in the future.
>
>> Correct? We may need to allocate another huge page for the target of the
>> migration, and there is no guarantee we can do that.
>>
>
> I agree that 1. is similar to "cannot migrate because OOM".
>
> So thinking about it again, we don't actually seem to lose that much when
>
> a) Rejecting migration of a huge page when not being able to allocate the vmemmap for our source page. Our system seems to be under quite some memory pressure already. Migration could just fail because we fail to allocate a migration target already.
>
> b) Rejecting to dissolve a huge page when not able to allocate the vmemmap. Dissolving can fail already. And, again, our system seems to be under quite some memory pressure already.
>
> c) Rejecting freeing huge pages when not able to allocate the vmemmap. I guess the "only" surprise is that the user might now no longer get what he asked for. This seems to be the "real change".
>
> So maybe little actually speaks against allowing for migration of such huge pages and optimizing any huge page, besides rejecting freeing of huge pages and surprising the user/admin.
>
> I guess while our system is under memory pressure CMA and ZONE_MOVABLE are already no longer able to always keep their guarantees - until there is no more memory pressure.
>

My thinking was similar. Failing to dissolve a hugetlb page because we could
not allocate vmemmap pages is not much/any worse than what we do when near
OOM conditions today. As for surprising the user/admin, we should certainly
log a warning if we can not dissolve a hugetlb page.

One point David R brought up is still a bit concerning. When getting close to
OOM, there may be users/code that will try to dissolve free hugetlb pages to
give back as much memory as possible to buddy. I've seen users holding 'big
chunks' of memory for a specific purpose and dumping them when needed. They
were not doing this with hugetlb pages, but nothing would surprise me.

In this series, vmemmap freeing is 'opt in' at boot time. I would expect the
use cases that want to opt in rarely if ever free/dissolve hugetlb pages.
But, I could be wrong.
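A sketch of the "fail and warn instead of looping" behaviour the thread is converging on, assuming alloc_huge_page_vmemmap() is changed to return an errno (the patch below does not do that); the elided comments stand in for the existing dissolve_free_huge_page() checks.

/* Sketch only, not code from the series. */
int dissolve_free_huge_page(struct page *page)
{
	struct hstate *h = page_hstate(page);

	/* ... existing checks that @page is a free, unreserved huge page ... */

	if (alloc_huge_page_vmemmap(h, page)) {
		pr_warn("HugeTLB: failed to allocate vmemmap, page not dissolved\n");
		return -ENOMEM;
	}

	/* ... remove the page from the free lists and free it to buddy ... */
	return 0;
}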
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f928994ed273..16b55d13b0ab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3007,6 +3007,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
 
 void vmemmap_remap_free(unsigned long start, unsigned long end,
 			unsigned long reuse);
+void vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			 unsigned long reuse);
 
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c165186ec2cf..d11c32fcdb38 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1326,6 +1326,8 @@ static void update_hpage_vmemmap_workfn(struct work_struct *work)
 	page->mapping = NULL;
 	h = page_hstate(page);
 
+	alloc_huge_page_vmemmap(h, page);
+
 	spin_lock(&hugetlb_lock);
 	__free_hugepage(h, page);
 	spin_unlock(&hugetlb_lock);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 19f1898aaede..6108ae80314f 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -183,6 +183,21 @@ static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
 }
 
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+	unsigned long vmemmap_addr = (unsigned long)head;
+	unsigned long vmemmap_end, vmemmap_reuse;
+
+	if (!free_vmemmap_pages_per_hpage(h))
+		return;
+
+	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+
+	vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse);
+}
+
 void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 	unsigned long vmemmap_addr = (unsigned long)head;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 01f8637adbe0..b2c8d2f11d48 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -11,6 +11,7 @@
 #include <linux/hugetlb.h>
 
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+void alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
 
 /*
@@ -25,6 +26,10 @@ static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 	return 0;
 }
 #else
+static inline void alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
 static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 {
 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index ce4be1fa93c2..3b146d5949f3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/pgtable.h>
 #include <linux/bootmem_info.h>
+#include <linux/delay.h>
 
 #include <asm/dma.h>
 #include <asm/pgalloc.h>
@@ -40,7 +41,8 @@
  * @remap_pte:		called for each non-empty PTE (lowest-level) entry.
  * @reuse_page:		the page which is reused for the tail vmemmap pages.
  * @reuse_addr:		the virtual address of the @reuse_page page.
- * @vmemmap_pages:	the list head of the vmemmap pages that can be freed.
+ * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
+ *			or is mapped from.
  */
 struct vmemmap_remap_walk {
 	void (*remap_pte)(pte_t *pte, unsigned long addr,
@@ -50,6 +52,10 @@ struct vmemmap_remap_walk {
 	struct list_head *vmemmap_pages;
 };
 
+/* The gfp mask of allocating vmemmap page */
+#define GFP_VMEMMAP_PAGE	\
+	(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_THISNODE)
+
 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 			      unsigned long end,
 			      struct vmemmap_remap_walk *walk)
@@ -228,6 +234,75 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
 	free_vmemmap_page_list(&vmemmap_pages);
 }
 
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+				struct vmemmap_remap_walk *walk)
+{
+	pgprot_t pgprot = PAGE_KERNEL;
+	struct page *page;
+	void *to;
+
+	BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+	list_del(&page->lru);
+	to = page_to_virt(page);
+	copy_page(to, (void *)walk->reuse_addr);
+
+	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+static void alloc_vmemmap_page_list(struct list_head *list,
+				    unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		struct page *page;
+		int nid = page_to_nid((const void *)addr);
+
+retry:
+		page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
+		if (unlikely(!page)) {
+			msleep(100);
+			/*
+			 * We should retry infinitely, because we cannot
+			 * handle allocation failures. Once we allocate
+			 * vmemmap pages successfully, then we can free
+			 * a HugeTLB page.
+			 */
+			goto retry;
+		}
+		list_add_tail(&page->lru, list);
+	}
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ *			 to the page which is from the @vmemmap_pages
+ *			 respectively.
+ * @start:	start address of the vmemmap virtual address range.
+ * @end:	end address of the vmemmap virtual address range.
+ * @reuse:	reuse address.
+ */
+void vmemmap_remap_alloc(unsigned long start, unsigned long end,
+			 unsigned long reuse)
+{
+	LIST_HEAD(vmemmap_pages);
+	struct vmemmap_remap_walk walk = {
+		.remap_pte	= vmemmap_restore_pte,
+		.reuse_addr	= reuse,
+		.vmemmap_pages	= &vmemmap_pages,
+	};
+
+	might_sleep();
+
+	/* See the comment in the vmemmap_remap_free(). */
+	BUG_ON(start - reuse != PAGE_SIZE);
+
+	alloc_vmemmap_page_list(&vmemmap_pages, start, end);
+	vmemmap_remap_range(reuse, end, &walk);
+}
+
 /*
  * Allocate a block of memory to be used to back the virtual memory map
  * or to back the page tables that are used to create the mapping.
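The alloc_vmemmap_page_list() above is the infinite retry loop that reviewers objected to earlier in the thread. A hedged sketch of a variant that propagates failure instead, assuming the function is changed to return an errno; the rollback follows common kernel list/page patterns and is not part of this patch.

static int alloc_vmemmap_page_list(struct list_head *list,
				   unsigned long start, unsigned long end)
{
	unsigned long addr;
	struct page *page, *next;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		int nid = page_to_nid((const void *)addr);

		/* __GFP_RETRY_MAYFAIL already retries inside the allocator. */
		page = alloc_pages_node(nid, GFP_VMEMMAP_PAGE, 0);
		if (!page)
			goto out_free;
		list_add_tail(&page->lru, list);
	}
	return 0;

out_free:
	/* Roll back what we already allocated and let the caller bail out. */
	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		__free_page(page);
	}
	return -ENOMEM;
}

vmemmap_remap_alloc() would then have to pass the error up so that alloc_huge_page_vmemmap() and the hugetlb code can decide to keep the page in the pool instead of freeing it to buddy.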
When we free a HugeTLB page to the buddy allocator, we should allocate
the vmemmap pages associated with it. We can do that in the freeing path,
right before __free_hugepage() hands the page back to buddy.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/mm.h   |  2 ++
 mm/hugetlb.c         |  2 ++
 mm/hugetlb_vmemmap.c | 15 ++++++++++
 mm/hugetlb_vmemmap.h |  5 ++++
 mm/sparse-vmemmap.c  | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 100 insertions(+), 1 deletion(-)