
[v9,04/10] mm: thp: Support allocation of anonymous multi-size THP

Message ID 20231207161211.2374093-5-ryan.roberts@arm.com (mailing list archive)
State New
Series Multi-size THP for anonymous memory

Commit Message

Ryan Roberts Dec. 7, 2023, 4:12 p.m. UTC
Introduce the logic to allow THP to be configured (through the new sysfs
interface we just added) to allocate large folios to back anonymous
memory, which are larger than the base page size but smaller than
PMD-size. We call this new THP extension "multi-size THP" (mTHP).

mTHP continues to be PTE-mapped, but in many cases can still provide
similar benefits to traditional PMD-sized THP: Page faults are
significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
the configured order), but latency spikes are much less prominent
because the size of each page isn't as huge as the PMD-sized variant and
there is less memory to clear in each page fault. The number of per-page
operations (e.g. ref counting, rmap management, lru list management) is
also significantly reduced since those ops now become per-folio.

Some architectures also employ TLB compression mechanisms to squeeze
more entries in when a set of PTEs are virtually and physically
contiguous and appropriately aligned. In this case, TLB misses will
occur less often.

The new behaviour is disabled by default, but can be enabled at runtime
by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
(see documentation in previous commit). The long term aim is to change
the default to include suitable lower orders, but there are some risks
around internal fragmentation that need to be better understood first.
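
For example (illustrative only; the exact directory name depends on the base
page size and on which orders the kernel exposes), on a system with 4K base
pages 64K mTHP could be enabled at runtime with:

  echo always > /sys/kernel/mm/transparent_hugepage/hugepage-64kb/enabled

The full set of accepted values is described in the documentation added in
the previous commit.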

Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Tested-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 include/linux/huge_mm.h |   6 ++-
 mm/memory.c             | 111 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 106 insertions(+), 11 deletions(-)

Comments

David Hildenbrand Dec. 12, 2023, 3:02 p.m. UTC | #1
On 07.12.23 17:12, Ryan Roberts wrote:
> Introduce the logic to allow THP to be configured (through the new sysfs
> interface we just added) to allocate large folios to back anonymous
> memory, which are larger than the base page size but smaller than
> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
> 
> mTHP continues to be PTE-mapped, but in many cases can still provide
> similar benefits to traditional PMD-sized THP: Page faults are
> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
> the configured order), but latency spikes are much less prominent
> because the size of each page isn't as huge as the PMD-sized variant and
> there is less memory to clear in each page fault. The number of per-page
> operations (e.g. ref counting, rmap management, lru list management) is
> also significantly reduced since those ops now become per-folio.

I'll note that with always-pte-mapped-thp it will be much easier to support
incremental page clearing (e.g., zero only parts of the folio and map the
remainder in a PROT_NONE-like fashion whereby we'll zero on the next page fault).
With a PMD-sized thp, you have to eventually place/rip out page tables to
achieve that.

> 
> Some architectures also employ TLB compression mechanisms to squeeze
> more entries in when a set of PTEs are virtually and physically
> contiguous and appropriately aligned. In this case, TLB misses will
> occur less often.
> 
> The new behaviour is disabled by default, but can be enabled at runtime
> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
> (see documentation in previous commit). The long term aim is to change
> the default to include suitable lower orders, but there are some risks
> around internal fragmentation that need to be better understood first.
> 
> Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> Tested-by: John Hubbard <jhubbard@nvidia.com>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>   include/linux/huge_mm.h |   6 ++-
>   mm/memory.c             | 111 ++++++++++++++++++++++++++++++++++++----
>   2 files changed, 106 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 609c153bae57..fa7a38a30fc6 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>   #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)

[...]

> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	unsigned long orders;
> +	struct folio *folio;
> +	unsigned long addr;
> +	pte_t *pte;
> +	gfp_t gfp;
> +	int order;
> +
> +	/*
> +	 * If uffd is active for the vma we need per-page fault fidelity to
> +	 * maintain the uffd semantics.
> +	 */
> +	if (unlikely(userfaultfd_armed(vma)))
> +		goto fallback;
> +
> +	/*
> +	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
> +	 * for this vma. Then filter out the orders that can't be allocated over
> +	 * the faulting address and still be fully contained in the vma.
> +	 */
> +	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
> +					  BIT(PMD_ORDER) - 1);
> +	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
> +
> +	if (!orders)
> +		goto fallback;
> +
> +	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
> +	if (!pte)
> +		return ERR_PTR(-EAGAIN);
> +
> +	/*
> +	 * Find the highest order where the aligned range is completely
> +	 * pte_none(). Note that all remaining orders will be completely
> +	 * pte_none().
> +	 */
> +	order = highest_order(orders);
> +	while (orders) {
> +		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> +		if (pte_range_none(pte + pte_index(addr), 1 << order))
> +			break;
> +		order = next_order(&orders, order);
> +	}
> +
> +	pte_unmap(pte);
> +
> +	/* Try allocating the highest of the remaining orders. */
> +	gfp = vma_thp_gfp_mask(vma);
> +	while (orders) {
> +		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> +		folio = vma_alloc_folio(gfp, order, vma, addr, true);
> +		if (folio) {
> +			clear_huge_page(&folio->page, vmf->address, 1 << order);
> +			return folio;
> +		}
> +		order = next_order(&orders, order);
> +	}
> +
> +fallback:
> +	return vma_alloc_zeroed_movable_folio(vma, vmf->address);
> +}
> +#else
> +#define alloc_anon_folio(vmf) \
> +		vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
> +#endif

A neater alternative might be

static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* magic */
fallback:
#endif
	return vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address);
}

[...]

Acked-by: David Hildenbrand <david@redhat.com>
Ryan Roberts Dec. 12, 2023, 3:38 p.m. UTC | #2
On 12/12/2023 15:02, David Hildenbrand wrote:
> On 07.12.23 17:12, Ryan Roberts wrote:
>> Introduce the logic to allow THP to be configured (through the new sysfs
>> interface we just added) to allocate large folios to back anonymous
>> memory, which are larger than the base page size but smaller than
>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>
>> mTHP continues to be PTE-mapped, but in many cases can still provide
>> similar benefits to traditional PMD-sized THP: Page faults are
>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>> the configured order), but latency spikes are much less prominent
>> because the size of each page isn't as huge as the PMD-sized variant and
>> there is less memory to clear in each page fault. The number of per-page
>> operations (e.g. ref counting, rmap management, lru list management) is
>> also significantly reduced since those ops now become per-folio.
> 
> I'll note that with always-pte-mapped-thp it will be much easier to support
> incremental page clearing (e.g., zero only parts of the folio and map the
> remainder in a PROT_NONE-like fashion whereby we'll zero on the next page fault).
> With a PMD-sized thp, you have to eventually place/rip out page tables to
> achieve that.

But then you lose the benefit of the reduced number of page faults; reducing
page faults gives a big speed-up for workloads with lots of short-lived
processes, like compiling.

But yes, I agree this could be an interesting future optimization for some
workloads.

> 
>>
>> Some architectures also employ TLB compression mechanisms to squeeze
>> more entries in when a set of PTEs are virtually and physically
>> contiguous and appropriately aligned. In this case, TLB misses will
>> occur less often.
>>
>> The new behaviour is disabled by default, but can be enabled at runtime
>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>> (see documentation in previous commit). The long term aim is to change
>> the default to include suitable lower orders, but there are some risks
>> around internal fragmentation that need to be better understood first.
>>
>> Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>> Tested-by: John Hubbard <jhubbard@nvidia.com>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>   include/linux/huge_mm.h |   6 ++-
>>   mm/memory.c             | 111 ++++++++++++++++++++++++++++++++++++----
>>   2 files changed, 106 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 609c153bae57..fa7a38a30fc6 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>   #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
> 
> [...]
> 
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>> +{
>> +    struct vm_area_struct *vma = vmf->vma;
>> +    unsigned long orders;
>> +    struct folio *folio;
>> +    unsigned long addr;
>> +    pte_t *pte;
>> +    gfp_t gfp;
>> +    int order;
>> +
>> +    /*
>> +     * If uffd is active for the vma we need per-page fault fidelity to
>> +     * maintain the uffd semantics.
>> +     */
>> +    if (unlikely(userfaultfd_armed(vma)))
>> +        goto fallback;
>> +
>> +    /*
>> +     * Get a list of all the (large) orders below PMD_ORDER that are enabled
>> +     * for this vma. Then filter out the orders that can't be allocated over
>> +     * the faulting address and still be fully contained in the vma.
>> +     */
>> +    orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>> +                      BIT(PMD_ORDER) - 1);
>> +    orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>> +
>> +    if (!orders)
>> +        goto fallback;
>> +
>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>> +    if (!pte)
>> +        return ERR_PTR(-EAGAIN);
>> +
>> +    /*
>> +     * Find the highest order where the aligned range is completely
>> +     * pte_none(). Note that all remaining orders will be completely
>> +     * pte_none().
>> +     */
>> +    order = highest_order(orders);
>> +    while (orders) {
>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> +        if (pte_range_none(pte + pte_index(addr), 1 << order))
>> +            break;
>> +        order = next_order(&orders, order);
>> +    }
>> +
>> +    pte_unmap(pte);
>> +
>> +    /* Try allocating the highest of the remaining orders. */
>> +    gfp = vma_thp_gfp_mask(vma);
>> +    while (orders) {
>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>> +        if (folio) {
>> +            clear_huge_page(&folio->page, vmf->address, 1 << order);
>> +            return folio;
>> +        }
>> +        order = next_order(&orders, order);
>> +    }
>> +
>> +fallback:
>> +    return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>> +}
>> +#else
>> +#define alloc_anon_folio(vmf) \
>> +        vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>> +#endif
> 
> A neater alternative might be
> 
> static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> {
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>     /* magic */
> fallback:
> #endif
>     return vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address);
> }

I guess beauty lies in the eye of the beholder... I don't find it much neater
personally :). But happy to make the change if you insist; what's the process
now that it's in mm-unstable? Just send a patch to Andrew for squashing?

> 
> [...]
> 
> Acked-by: David Hildenbrand <david@redhat.com>
>
David Hildenbrand Dec. 12, 2023, 4:35 p.m. UTC | #3
On 12.12.23 16:38, Ryan Roberts wrote:
> On 12/12/2023 15:02, David Hildenbrand wrote:
>> On 07.12.23 17:12, Ryan Roberts wrote:
>>> Introduce the logic to allow THP to be configured (through the new sysfs
>>> interface we just added) to allocate large folios to back anonymous
>>> memory, which are larger than the base page size but smaller than
>>> PMD-size. We call this new THP extension "multi-size THP" (mTHP).
>>>
>>> mTHP continues to be PTE-mapped, but in many cases can still provide
>>> similar benefits to traditional PMD-sized THP: Page faults are
>>> significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on
>>> the configured order), but latency spikes are much less prominent
>>> because the size of each page isn't as huge as the PMD-sized variant and
>>> there is less memory to clear in each page fault. The number of per-page
>>> operations (e.g. ref counting, rmap management, lru list management) is
>>> also significantly reduced since those ops now become per-folio.
>>
>> I'll note that with always-pte-mapped-thp it will be much easier to support
>> incremental page clearing (e.g., zero only parts of the folio and map the
>> remainder in a PROT_NONE-like fashion whereby we'll zero on the next page fault).
>> With a PMD-sized thp, you have to eventually place/rip out page tables to
>> achieve that.
> 
> But then you lose the benefit of the reduced number of page faults; reducing
> page faults gives a big speed-up for workloads with lots of short-lived
> processes, like compiling.

Well, you can do interesting things like "allocate order-5", but zero in 
order-3 chunks. You get fewer page faults and pay for alloc/rmap only once.
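
(Illustrative arithmetic, assuming 4K base pages: an order-5 folio is 32
pages / 128K; clearing it in order-3 / 32K chunks would mean 4 faults per
folio instead of 32, while the allocation, refcount and rmap setup still
happen only once.)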

But yes, all has pros and cons.

[...]

>>
>>>
>>> Some architectures also employ TLB compression mechanisms to squeeze
>>> more entries in when a set of PTEs are virtually and physically
>>> contiguous and appropriately aligned. In this case, TLB misses will
>>> occur less often.
>>>
>>> The new behaviour is disabled by default, but can be enabled at runtime
>>> by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled
>>> (see documentation in previous commit). The long term aim is to change
>>> the default to include suitable lower orders, but there are some risks
>>> around internal fragmentation that need to be better understood first.
>>>
>>> Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
>>> Tested-by: John Hubbard <jhubbard@nvidia.com>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>    include/linux/huge_mm.h |   6 ++-
>>>    mm/memory.c             | 111 ++++++++++++++++++++++++++++++++++++----
>>>    2 files changed, 106 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 609c153bae57..fa7a38a30fc6 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>    #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>>
>> [...]
>>
>>> +
>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    unsigned long orders;
>>> +    struct folio *folio;
>>> +    unsigned long addr;
>>> +    pte_t *pte;
>>> +    gfp_t gfp;
>>> +    int order;
>>> +
>>> +    /*
>>> +     * If uffd is active for the vma we need per-page fault fidelity to
>>> +     * maintain the uffd semantics.
>>> +     */
>>> +    if (unlikely(userfaultfd_armed(vma)))
>>> +        goto fallback;
>>> +
>>> +    /*
>>> +     * Get a list of all the (large) orders below PMD_ORDER that are enabled
>>> +     * for this vma. Then filter out the orders that can't be allocated over
>>> +     * the faulting address and still be fully contained in the vma.
>>> +     */
>>> +    orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
>>> +                      BIT(PMD_ORDER) - 1);
>>> +    orders = thp_vma_suitable_orders(vma, vmf->address, orders);
>>> +
>>> +    if (!orders)
>>> +        goto fallback;
>>> +
>>> +    pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
>>> +    if (!pte)
>>> +        return ERR_PTR(-EAGAIN);
>>> +
>>> +    /*
>>> +     * Find the highest order where the aligned range is completely
>>> +     * pte_none(). Note that all remaining orders will be completely
>>> +     * pte_none().
>>> +     */
>>> +    order = highest_order(orders);
>>> +    while (orders) {
>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +        if (pte_range_none(pte + pte_index(addr), 1 << order))
>>> +            break;
>>> +        order = next_order(&orders, order);
>>> +    }
>>> +
>>> +    pte_unmap(pte);
>>> +
>>> +    /* Try allocating the highest of the remaining orders. */
>>> +    gfp = vma_thp_gfp_mask(vma);
>>> +    while (orders) {
>>> +        addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> +        folio = vma_alloc_folio(gfp, order, vma, addr, true);
>>> +        if (folio) {
>>> +            clear_huge_page(&folio->page, vmf->address, 1 << order);
>>> +            return folio;
>>> +        }
>>> +        order = next_order(&orders, order);
>>> +    }
>>> +
>>> +fallback:
>>> +    return vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>> +}
>>> +#else
>>> +#define alloc_anon_folio(vmf) \
>>> +        vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
>>> +#endif
>>
>> A neater alternative might be
>>
>> static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>> {
>> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>      /* magic */
>> fallback:
>> #endif
>>      return vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address);
>> }
> 
> I guess beauty lies in the eye of the beholder... I don't find it much neater
> personally :). But happy to make the change if you insist; what's the process
> now that it's in mm-unstable? Just send a patch to Andrew for squashing?

That way it is clear that the fallback for thp is just what !thp does.

But either is fine for me; no need to change if you disagree.
Dan Carpenter Dec. 13, 2023, 7:21 a.m. UTC | #4
On Thu, Dec 07, 2023 at 04:12:05PM +0000, Ryan Roberts wrote:
> @@ -4176,10 +4260,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  	/* Allocate our own private page. */
>  	if (unlikely(anon_vma_prepare(vma)))
>  		goto oom;
> -	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
> +	folio = alloc_anon_folio(vmf);
> +	if (IS_ERR(folio))
> +		return 0;
>  	if (!folio)
>  		goto oom;

Returning zero is weird.  I think it should be a vm_fault_t code.

This mixing of error pointers and NULL is going to cause problems.
Normally when we have a mix of error pointers and NULL then the NULL is
not an error but instead means that the feature has been deliberately
turned off.  I'm unable to figure out what the meaning is here.

It should return one or the other, or if it's a mix then add a giant
comment explaining what they mean.

regards,
dan carpenter

>  
> +	nr_pages = folio_nr_pages(folio);
> +	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> +
>  	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>  		goto oom_free_page;
>  	folio_throttle_swaprate(folio, GFP_KERNEL);
Ryan Roberts Dec. 14, 2023, 10:54 a.m. UTC | #5
On 13/12/2023 07:21, Dan Carpenter wrote:
> On Thu, Dec 07, 2023 at 04:12:05PM +0000, Ryan Roberts wrote:
>> @@ -4176,10 +4260,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>  	/* Allocate our own private page. */
>>  	if (unlikely(anon_vma_prepare(vma)))
>>  		goto oom;
>> -	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>> +	folio = alloc_anon_folio(vmf);
>> +	if (IS_ERR(folio))
>> +		return 0;
>>  	if (!folio)
>>  		goto oom;
> 
> Returning zero is weird.  I think it should be a vm_fault_t code.

It's the same pattern that the existing code a little further down this function
already implements:

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
	if (!vmf->pte)
		goto release;

If we fail to map/lock the pte (due to a race), then we return 0 to allow user
space to rerun the faulting instruction and cause the fault to happen again. The
above code ends up calling "return ret;" and ret is 0.

> 
> This mixing of error pointers and NULL is going to cause problems.
> Normally when we have a mix of error pointers and NULL then the NULL is
> not an error but instead means that the feature has been deliberately
> turned off.  I'm unable to figure out what the meaning is here.

There are 3 conditions that the function can return:

 - folio successfully allocated
 - folio failed to be allocated due to OOM
 - fault needs to be tried again due to losing race

Previously only the first 2 conditions were possible and they were indicated by
NULL/not-NULL. The new 3rd condition is only possible when THP is compile-time
enabled. So it keeps the logic simpler to keep the NULL/not-NULL distinction for
the first 2, and use the error code for the final one.

There are IS_ERR() and IS_ERR_OR_NULL() variants so I assume a pattern where you
can have pointer, error or NULL is somewhat common already?

Thanks,
Ryan

> 
> It should return one or the other, or if it's a mix then add a giant
> comment explaining what they mean.
> 
> regards,
> dan carpenter
> 
>>  
>> +	nr_pages = folio_nr_pages(folio);
>> +	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>> +
>>  	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
>>  		goto oom_free_page;
>>  	folio_throttle_swaprate(folio, GFP_KERNEL);
>
Dan Carpenter Dec. 14, 2023, 11:30 a.m. UTC | #6
On Thu, Dec 14, 2023 at 10:54:19AM +0000, Ryan Roberts wrote:
> On 13/12/2023 07:21, Dan Carpenter wrote:
> > On Thu, Dec 07, 2023 at 04:12:05PM +0000, Ryan Roberts wrote:
> >> @@ -4176,10 +4260,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> >>  	/* Allocate our own private page. */
> >>  	if (unlikely(anon_vma_prepare(vma)))
> >>  		goto oom;
> >> -	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
> >> +	folio = alloc_anon_folio(vmf);
> >> +	if (IS_ERR(folio))
> >> +		return 0;
> >>  	if (!folio)
> >>  		goto oom;
> > 
> > Returning zero is weird.  I think it should be a vm_fault_t code.
> 
> It's the same pattern that the existing code a little further down this function
> already implements:
> 
> 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
> 	if (!vmf->pte)
> 		goto release;
> 
> If we fail to map/lock the pte (due to a race), then we return 0 to allow user
> space to rerun the faulting instruction and cause the fault to happen again. The
> above code ends up calling "return ret;" and ret is 0.
> 

Ah, okay.  Thanks!

> > 
> > This mixing of error pointers and NULL is going to cause problems.
> > Normally when we have a mix of error pointers and NULL then the NULL is
> > not an error but instead means that the feature has been deliberately
> > turned off.  I'm unable to figure out what the meaning is here.
> 
> There are 3 conditions that the function can return:
> 
>  - folio successfully allocated
>  - folio failed to be allocated due to OOM
>  - fault needs to be tried again due to losing race
> 
> Previously only the first 2 conditions were possible and they were indicated by
> NULL/not-NULL. The new 3rd condition is only possible when THP is compile-time
> enabled. So it keeps the logic simpler to keep the NULL/not-NULL distinction for
> the first 2, and use the error code for the final one.
> 
> There are IS_ERR() and IS_ERR_OR_NULL() variants so I assume a pattern where you
> can have pointer, error or NULL is somewhat common already?

People are confused by this a lot so I have written a blog about it:

https://staticthinking.wordpress.com/2022/08/01/mixing-error-pointers-and-null/

The IS_ERR_OR_NULL() function should be used like this:

int blink_leds()
{
	led = get_leds();
	if (IS_ERR_OR_NULL(led))
		return PTR_ERR(led);  <-- NULL means zero/success
	return led->blink();
}

In the case of alloc_anon_folio(), I would be tempted to create a
wrapper around it where NULL becomes ERR_PTR(-ENOMEM).  But this is
obviously fast path code and I haven't benchmarked it.
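
For illustration, a minimal sketch of such a wrapper (the name is made up and
this is not part of the patch) might look like:

static struct folio *alloc_anon_folio_or_err(struct vm_fault *vmf)
{
	struct folio *folio = alloc_anon_folio(vmf);

	/* Fold NULL (OOM) into an error pointer so callers only check IS_ERR(). */
	return folio ? folio : ERR_PTR(-ENOMEM);
}

The caller would then translate -ENOMEM into the oom path and -EAGAIN into
"return 0 and retry the fault".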

Adding a comment is the other option.

regards,
dan carpenter
Ryan Roberts Dec. 14, 2023, 12:12 p.m. UTC | #7
On 14/12/2023 11:30, Dan Carpenter wrote:
> On Thu, Dec 14, 2023 at 10:54:19AM +0000, Ryan Roberts wrote:
>> On 13/12/2023 07:21, Dan Carpenter wrote:
>>> On Thu, Dec 07, 2023 at 04:12:05PM +0000, Ryan Roberts wrote:
>>>> @@ -4176,10 +4260,15 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>>>>  	/* Allocate our own private page. */
>>>>  	if (unlikely(anon_vma_prepare(vma)))
>>>>  		goto oom;
>>>> -	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
>>>> +	folio = alloc_anon_folio(vmf);
>>>> +	if (IS_ERR(folio))
>>>> +		return 0;
>>>>  	if (!folio)
>>>>  		goto oom;
>>>
>>> Returning zero is weird.  I think it should be a vm_fault_t code.
>>
>> It's the same pattern that the existing code a little further down this function
>> already implements:
>>
>> 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
>> 	if (!vmf->pte)
>> 		goto release;
>>
>> If we fail to map/lock the pte (due to a race), then we return 0 to allow user
>> space to rerun the faulting instruction and cause the fault to happen again. The
>> above code ends up calling "return ret;" and ret is 0.
>>
> 
> Ah, okay.  Thanks!
> 
>>>
>>> This mixing of error pointers and NULL is going to cause problems.
>>> Normally when we have a mix of error pointers and NULL then the NULL is
>>> not an error but instead means that the feature has been deliberately
>>> turned off.  I'm unable to figure out what the meaning is here.
>>
>> There are 3 conditions that the function can return:
>>
>>  - folio successfully allocated
>>  - folio failed to be allocated due to OOM
>>  - fault needs to be tried again due to losing race
>>
>> Previously only the first 2 conditions were possible and they were indicated by
>> NULL/not-NULL. The new 3rd condition is only possible when THP is compile-time
>> enabled. So it keeps the logic simpler to keep the NULL/not-NULL distinction for
>> the first 2, and use the error code for the final one.
>>
>> There are IS_ERR() and IS_ERR_OR_NULL() variants so I assume a pattern where you
>> can have pointer, error or NULL is somewhat common already?
> 
> People are confused by this a lot so I have written a blog about it:
> 
> https://staticthinking.wordpress.com/2022/08/01/mixing-error-pointers-and-null/

Nice; thanks for the pointer :)

> 
> The IS_ERR_OR_NULL() function should be used like this:
> 
> int blink_leds()
> {
> 	led = get_leds();
> 	if (IS_ERR_OR_NULL(led))
> 		return PTR_ERR(led);  <-- NULL means zero/success
> 	return led->blink();
> }
> 
> In the case of alloc_anon_folio(), I would be tempted to create a
> wrapper around it where NULL becomes ERR_PTR(-ENOMEM).  But this is
> obviously fast path code and I haven't benchmarked it.
> 
> Adding a comment is the other option.

I'll add a comment; as you say this is a fast path, and I'm actively being
burned in similar places (on another series I'm working on) where an additional
check regresses performance significantly, so I'm not keen on risking it here.
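
One possible wording for that comment (just a sketch, not the final text):

/*
 * Returns: a folio on success; NULL if the allocation failed (caller takes
 * the oom path); or ERR_PTR(-EAGAIN) if the pte could not be mapped and the
 * fault should simply be retried (caller returns 0).
 */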

Andrew, I'll fold in David's suggested ifdef improvement at the same time.
Would you prefer an additional patch to squash in, or a whole new version of the
series to swap out with the existing patches in mm-unstable?

> 
> regards,
> dan carpenter
>

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 609c153bae57..fa7a38a30fc6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -68,9 +68,11 @@  extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 
 /*
- * Mask of all large folio orders supported for anonymous THP.
+ * Mask of all large folio orders supported for anonymous THP; all orders up to
+ * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
+ * (which is a limitation of the THP implementation).
  */
-#define THP_ORDERS_ALL_ANON	BIT(PMD_ORDER)
+#define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
 
 /*
  * Mask of all large folio orders supported for file THP.
diff --git a/mm/memory.c b/mm/memory.c
index 8ab2d994d997..8f0b936b90b5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4125,6 +4125,87 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	return ret;
 }
 
+static bool pte_range_none(pte_t *pte, int nr_pages)
+{
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (!pte_none(ptep_get_lockless(pte + i)))
+			return false;
+	}
+
+	return true;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static struct folio *alloc_anon_folio(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long orders;
+	struct folio *folio;
+	unsigned long addr;
+	pte_t *pte;
+	gfp_t gfp;
+	int order;
+
+	/*
+	 * If uffd is active for the vma we need per-page fault fidelity to
+	 * maintain the uffd semantics.
+	 */
+	if (unlikely(userfaultfd_armed(vma)))
+		goto fallback;
+
+	/*
+	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
+	 * for this vma. Then filter out the orders that can't be allocated over
+	 * the faulting address and still be fully contained in the vma.
+	 */
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
+					  BIT(PMD_ORDER) - 1);
+	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+
+	if (!orders)
+		goto fallback;
+
+	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
+	if (!pte)
+		return ERR_PTR(-EAGAIN);
+
+	/*
+	 * Find the highest order where the aligned range is completely
+	 * pte_none(). Note that all remaining orders will be completely
+	 * pte_none().
+	 */
+	order = highest_order(orders);
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		if (pte_range_none(pte + pte_index(addr), 1 << order))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	pte_unmap(pte);
+
+	/* Try allocating the highest of the remaining orders. */
+	gfp = vma_thp_gfp_mask(vma);
+	while (orders) {
+		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		if (folio) {
+			clear_huge_page(&folio->page, vmf->address, 1 << order);
+			return folio;
+		}
+		order = next_order(&orders, order);
+	}
+
+fallback:
+	return vma_alloc_zeroed_movable_folio(vma, vmf->address);
+}
+#else
+#define alloc_anon_folio(vmf) \
+		vma_alloc_zeroed_movable_folio((vmf)->vma, (vmf)->address)
+#endif
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -4134,9 +4215,12 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 {
 	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
 	struct vm_area_struct *vma = vmf->vma;
+	unsigned long addr = vmf->address;
 	struct folio *folio;
 	vm_fault_t ret = 0;
+	int nr_pages = 1;
 	pte_t entry;
+	int i;
 
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
@@ -4176,10 +4260,15 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	/* Allocate our own private page. */
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
+	folio = alloc_anon_folio(vmf);
+	if (IS_ERR(folio))
+		return 0;
 	if (!folio)
 		goto oom;
 
+	nr_pages = folio_nr_pages(folio);
+	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+
 	if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
 		goto oom_free_page;
 	folio_throttle_swaprate(folio, GFP_KERNEL);
@@ -4196,12 +4285,15 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry), vma);
 
-	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
-			&vmf->ptl);
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
 	if (!vmf->pte)
 		goto release;
-	if (vmf_pte_changed(vmf)) {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
+	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
+		update_mmu_tlb(vma, addr, vmf->pte);
+		goto release;
+	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+		for (i = 0; i < nr_pages; i++)
+			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
 		goto release;
 	}
 
@@ -4216,16 +4308,17 @@  static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return handle_userfault(vmf, VM_UFFD_MISSING);
 	}
 
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-	folio_add_new_anon_rmap(folio, vma, vmf->address);
+	folio_ref_add(folio, nr_pages - 1);
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+	folio_add_new_anon_rmap(folio, vma, addr);
 	folio_add_lru_vma(folio, vma);
 setpte:
 	if (uffd_wp)
 		entry = pte_mkuffd_wp(entry);
-	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);
 
 	/* No need to invalidate - it was non-present before */
-	update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
+	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
 unlock:
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);