
[RFC,v4,3/4] mm: add do_set_pte_range()

Message ID: 20230206140639.538867-4-fengwei.yin@intel.com
State: New
Series: folio based filemap_map_pages()

Commit Message

Yin Fengwei Feb. 6, 2023, 2:06 p.m. UTC
do_set_pte_range() allows setting up the page table entries for a
specific range of pages. It calls folio_add_file_rmap_range() to take
advantage of batched rmap updates for large folios.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 include/linux/mm.h |  3 +++
 mm/filemap.c       |  1 -
 mm/memory.c        | 66 ++++++++++++++++++++++++++++++++--------------
 3 files changed, 49 insertions(+), 21 deletions(-)
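
For illustration, the call pattern this enables looks roughly like the
sketch below. The "before" loop is simplified (PTE pointer advancement
is omitted) and the caller shape is an assumption based on patch 4/4 of
this series, not code from this patch:

    /* before: one call per page, one rmap update per page */
    for (i = 0; i < nr_pages; i++)
            do_set_pte(vmf, folio_page(folio, start + i),
                       addr + i * PAGE_SIZE);

    /* after: one ranged call; rmap and mm counters updated once */
    do_set_pte_range(vmf, folio, addr, vmf->pte, start, nr_pages);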

Comments

Matthew Wilcox Feb. 6, 2023, 2:44 p.m. UTC | #1
On Mon, Feb 06, 2023 at 10:06:38PM +0800, Yin Fengwei wrote:
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d6f8f41514cc..93192f04b276 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>  
>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned long addr, pte_t *pte,
> +		unsigned long start, unsigned int nr);

There are only two callers of do_set_pte(), and they're both in mm.
I don't think we should retain do_set_pte() as a wrapper, but rather
change both callers to call 'set_pte_range()'.  The 'do' doesn't add
any value, so let's drop that word.
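
Concretely, the suggestion amounts to something like this sketch; the
signature matches the proposal Matthew posts later in this thread:

    void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                    struct page *page, unsigned int nr, unsigned long addr);

    /* a caller mapping a single page then becomes: */
    set_pte_range(vmf, folio, page, 1, addr);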

> +	if (!cow) {
> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +	} else {
> +		/*
> +		 * rmap code is not ready to handle COW with anonymous
> +		 * large folio yet. Capture and warn if large folio
> +		 * is given.
> +		 */
> +		VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
> +	}

The handling of cow pages is still very clunky.
folio_add_new_anon_rmap() handles anonymous large folios just fine.  I
think David was looking at current code, not the code in mm-next.

> +		set_pte_at(vma->vm_mm, addr, pte, entry);
> +
> +		/* no need to invalidate: a not-present page won't be cached */
> +		update_mmu_cache(vma, addr, pte);
> +	} while (pte++, page++, addr += PAGE_SIZE, --nr > 0);

There's no need to speed-run this.  Let's do it properly and get the
arch interface right.  This code isn't going to hit linux-next for four
more weeks, which is plenty of time.
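
For reference, a generic fallback for such a batched arch interface
could look like the sketch below. set_ptes() did not exist at the time
of this thread, and pte_next() is the hypothetical helper discussed
further down, so this is only the expected shape:

    #ifndef set_ptes
    /*
     * Sketch: install nr consecutive PTEs, advancing the PFN by one
     * page each iteration. An architecture would override this with
     * something cheaper where possible.
     */
    static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pte, unsigned int nr)
    {
            for (;;) {
                    set_pte_at(mm, addr, ptep, pte);
                    if (--nr == 0)
                            break;
                    ptep++;
                    addr += PAGE_SIZE;
                    pte = pte_next(pte);    /* hypothetical helper */
            }
    }
    #endif
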
Yin Fengwei Feb. 6, 2023, 2:58 p.m. UTC | #2
On 2/6/2023 10:44 PM, Matthew Wilcox wrote:
> On Mon, Feb 06, 2023 at 10:06:38PM +0800, Yin Fengwei wrote:
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index d6f8f41514cc..93192f04b276 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>  
>>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +		unsigned long addr, pte_t *pte,
>> +		unsigned long start, unsigned int nr);
> 
> There are only two callers of do_set_pte(), and they're both in mm.
> I don't think we should retain do_set_pte() as a wrapper, but rather
> change both callers to call 'set_pte_range()'.  The 'do' doesn't add
> any value, so let's drop that word.
OK.

> 
>> +	if (!cow) {
>> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
>> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>> +	} else {
>> +		/*
>> +		 * rmap code is not ready to handle COW with anonymous
>> +		 * large folio yet. Capture and warn if large folio
>> +		 * is given.
>> +		 */
>> +		VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
>> +	}
> 
> The handling of cow pages is still very clunky.
> folio_add_new_anon_rmap() handles anonymous large folios just fine.  I
> think David was looking at current code, not the code in mm-next.
OK. Let's wait for further comment from David.

> 
>> +		set_pte_at(vma->vm_mm, addr, pte, entry);
>> +
>> +		/* no need to invalidate: a not-present page won't be cached */
>> +		update_mmu_cache(vma, addr, pte);
>> +	} while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
> 
> There's no need to speed-run this.  Let's do it properly and get the
Sorry, I don't get your point here. Do you mean pte_next()? Or something
else?


Regards
Yin, Fengwei

> arch interface right.  This code isn't going to hit linux-next for four
> more weeks, which is plenty of time.
David Hildenbrand Feb. 6, 2023, 3:13 p.m. UTC | #3
On 06.02.23 15:58, Yin, Fengwei wrote:
> 
> 
> On 2/6/2023 10:44 PM, Matthew Wilcox wrote:
>> On Mon, Feb 06, 2023 at 10:06:38PM +0800, Yin Fengwei wrote:
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index d6f8f41514cc..93192f04b276 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>>   
>>>   vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>>   void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>>> +		unsigned long addr, pte_t *pte,
>>> +		unsigned long start, unsigned int nr);
>>
>> There are only two callers of do_set_pte(), and they're both in mm.
>> I don't think we should retain do_set_pte() as a wrapper, but rather
>> change both callers to call 'set_pte_range()'.  The 'do' doesn't add
>> any value, so let's drop that word.
> OK.
> 
>>
>>> +	if (!cow) {
>>> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
>>> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>>> +	} else {
>>> +		/*
>>> +		 * rmap code is not ready to handle COW with anonymous
>>> +		 * large folio yet. Capture and warn if large folio
>>> +		 * is given.
>>> +		 */
>>> +		VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
>>> +	}
>>
>> The handling of cow pages is still very clunky.
>> folio_add_new_anon_rmap() handles anonymous large folios just fine.  I
>> think David was looking at current code, not the code in mm-next.
> OK. Let's wait for further comment from David.

As I raised, page_add_new_anon_rmap() -> folio_add_new_anon_rmap() can 
be used to add a fresh (a) PMD-mapped THP or (b) order-0 folio.

folio_add_new_anon_rmap() is not suitable for PTE-mapping a large folio. 
Which is what we are intending to do here unless I am completely off.

PTE-mapping a large folio requires different accounting, different 
mapcount handling and different PG_anon_exclusive handling.

Which is all not there yet.
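
For illustration, the per-subpage work David refers to would roughly
take the following shape; this is a sketch of the missing pieces, not
existing kernel code, and it elides the anon folio setup itself:

    /* accounting: charge nr pages, not one */
    add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);

    for (i = 0; i < nr; i++) {
            struct page *page = folio_page(folio, i);

            /* mapcount: per subpage, not the entire-folio mapcount */
            atomic_set(&page->_mapcount, 0);
            /* COW tracking: PG_anon_exclusive per subpage */
            SetPageAnonExclusive(page);
    }
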
Matthew Wilcox Feb. 6, 2023, 4:33 p.m. UTC | #4
On Mon, Feb 06, 2023 at 04:13:44PM +0100, David Hildenbrand wrote:
> > > The handling of cow pages is still very clunky.
> > > folio_add_new_anon_rmap() handles anonymous large folios just fine.  I
> > > think David was looking at current code, not the code in mm-next.
> > OK. Let's wait for further comment from David.
> 
> As I raised, page_add_new_anon_rmap() -> folio_add_new_anon_rmap() can be
> used to add a fresh (a) PMD-mapped THP or (b) order-0 folio.
>
> folio_add_new_anon_rmap() is not suitable for PTE-mapping a large folio.
> Which is what we are intending to do here unless I am completely off.

I think you are.  While the infrastructure here handles large folios
which are not PMDs, there's nobody who will allocate such a thing, so
there is no problem.  Right?

> PTE-mapping a large folio requires different accounting, different mapcount
> handling and different PG_anon_exclusive handling.
> 
> Which is all not there yet.

Assuming you mean "a large folio which is not PMD sized", I agree.  But we
haven't even started discussing how we should decide to allocate folios
which are not 0 or PMD order yet, so this worry seems premature.  When we
have figured that out, it'll be time to look at folio_add_new_anon_rmap()
to decide how to support it, but hopefully before then we'll have a
better way of handling mapcount.
David Hildenbrand Feb. 6, 2023, 4:35 p.m. UTC | #5
On 06.02.23 17:33, Matthew Wilcox wrote:
> On Mon, Feb 06, 2023 at 04:13:44PM +0100, David Hildenbrand wrote:
>>>> The handling of cow pages is still very clunky.
>>>> folio_add_new_anon_rmap() handles anonymous large folios just fine.  I
>>>> think David was looking at current code, not the code in mm-next.
>>> OK. Let's wait for further comment from David.
>>
>> As I raised, page_add_new_anon_rmap() -> folio_add_new_anon_rmap() can be
>> used to add a fresh (a) PMD-mapped THP or (b) order-0 folio.
>>
>> folio_add_new_anon_rmap() is not suitable for PTE-mapping a large folio.
>> Which is what we are intending to do here unless I am completely off.
> 
> I think you are.  While the infrastructure here handles large folios
> which are not PMDs, there's nobody who will allocate such a thing, so
> there is no problem.  Right?

And that's precisely what I want to have fenced off here. I want that 
function to complain instead of silently doing the wrong thing.
Matthew Wilcox Feb. 6, 2023, 4:43 p.m. UTC | #6
On Mon, Feb 06, 2023 at 05:35:55PM +0100, David Hildenbrand wrote:
> On 06.02.23 17:33, Matthew Wilcox wrote:
> > On Mon, Feb 06, 2023 at 04:13:44PM +0100, David Hildenbrand wrote:
> > > > > The handling of cow pages is still very clunky.
> > > > > folio_add_new_anon_rmap() handles anonymous large folios just fine.  I
> > > > > think David was looking at current code, not the code in mm-next.
> > > > OK. Let's wait for further comment from David.
> > > 
> > > As I raised, page_add_new_anon_rmap() -> folio_add_new_anon_rmap() can be
> > > used to add a fresh (a) PMD-mapped THP or (b) order-0 folio.
> > > 
> > > folio_add_new_anon_rmap() is not suitable for PTE-mapping a large folio.
> > > Which is what we are intending to do here unless I am completely off.
> > 
> > I think you are.  While the infrastructure here handles large folios
> > which are not PMDs, there's nobody who will allocate such a thing, so
> > > there is no problem.  Right?
> 
> And that's precisely what I want to have fenced off here. I want that
> function to complain instead of silently doing the wrong thing.

If this were a widely called function, I'd agree.  But there are two
callers of do_set_pte; one in filemap.c and one in memory.c.  It's
overcautious and has created huge churn in this patchset.  If you're
really that concerned, stick a VM_BUG_ON() in folio_add_new_anon_rmap()
instead of making weird stuff happen in set_pte_range().
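
The alternative Matthew describes would be a one-line assertion,
roughly like this sketch:

    /* In folio_add_new_anon_rmap(), which only handles order-0 folios
     * and PMD-mapped THPs: complain about anything in between. */
    VM_BUG_ON_FOLIO(folio_test_large(folio) &&
                    !folio_test_pmd_mappable(folio), folio);
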
David Hildenbrand Feb. 6, 2023, 4:49 p.m. UTC | #7
On 06.02.23 17:43, Matthew Wilcox wrote:
> On Mon, Feb 06, 2023 at 05:35:55PM +0100, David Hildenbrand wrote:
>> On 06.02.23 17:33, Matthew Wilcox wrote:
>>> On Mon, Feb 06, 2023 at 04:13:44PM +0100, David Hildenbrand wrote:
>>>>>> The handling of cow pages is still very clunky.
>>>>>> folio_add_new_anon_rmap() handles anonymous large folios just fine.  I
>>>>>> think David was looking at current code, not the code in mm-next.
>>>>> OK. Let's wait for further comment from David.
>>>>
>>>> As I raised, page_add_new_anon_rmap() -> folio_add_new_anon_rmap() can be
>>>> used to add a fresh (a) PMD-mapped THP or (b) order-0 folio.
>>>>
>>>> folio_add_new_anon_rmap() is not suitable for PTE-mapping a large folio.
>>>> Which is what we are intending to do here unless I am completely off.
>>>
>>> I think you are.  While the infrastructure here handles large folios
>>> which are not PMDs, there's nobody who will allocate such a thing, so
>>> there is no problem.  Right?
>>
>> And that's precisely what I want to have fenced off here. I want that
>> function to complain instead of silently doing the wrong thing.
> 
> If this were a widely called function, I'd agree.  But there are two
> callers of do_set_pte; one in filemap.c and one in memory.c.  It's
> overcautious and has created huge churn in this patchset.  If you're
> really that concerned, stick a VM_BUG_ON() in folio_add_new_anon_rmap()
> instead of making weird stuff happen in set_pte_range().

We have

+	if (!cow) {
+		folio_add_file_rmap_range(folio, start, nr, vma, false);
+		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+	} else {
+		/*
+		 * rmap code is not ready to handle COW with anonymous
+		 * large folio yet. Capture and warn if large folio
+		 * is given.
+		 */
+		VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
+	}

now.

What are we supposed to add on the else branch instead that would be
correct in the future? Or not look weird?

Go on, scream louder at me, I don't care.
Matthew Wilcox Feb. 6, 2023, 5:10 p.m. UTC | #8
On Mon, Feb 06, 2023 at 05:49:20PM +0100, David Hildenbrand wrote:
> We have
> 
> +	if (!cow) {
> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +	} else {
> +		/*
> +		 * rmap code is not ready to handle COW with anonymous
> +		 * large folio yet. Capture and warn if large folio
> +		 * is given.
> +		 */
> +		VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
> +	}
> 
> now.
> 
> What are we supposed to add on the else branch instead that would be
> correct in the future? Or not look weird?

Right now, I think this patch should look something like this.

diff --git a/mm/memory.c b/mm/memory.c
index 7a04a1130ec1..2f6173f83d8b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4257,15 +4257,18 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		struct page *page, unsigned int nr, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool prefault = vmf->address != addr;
 	pte_t entry;
+	unsigned int i;
 
-	flush_icache_page(vma, page);
+	for (i = 0; i < nr; i++)
+		flush_icache_page(vma, page + i);
 	entry = mk_pte(page, vma->vm_page_prot);
 
 	if (prefault && arch_wants_old_prefaulted_pte())
@@ -4279,14 +4282,15 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 		entry = pte_mkuffd_wp(entry);
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-		page_add_new_anon_rmap(page, vma, addr);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+		VM_BUG_ON_FOLIO(nr != 1, folio);
+		folio_add_new_anon_rmap(folio, vma, addr);
+		folio_add_lru_vma(folio, vma);
 	} else {
-		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page, vma, false);
+		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+		folio_add_file_rmap_range(folio, page, nr, vma, false);
 	}
-	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
@@ -4359,7 +4363,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 
 	/* Re-check under ptl */
 	if (likely(!vmf_pte_changed(vmf))) {
-		do_set_pte(vmf, page, vmf->address);
+		struct folio *folio = page_folio(page);
+
+		set_pte_range(vmf, folio, page, 1, vmf->address);
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, vmf->address, vmf->pte);

> Go on, scream louder at me, I don't care.

I'm not even shouting.  I just think you're wrong ;-)
David Hildenbrand Feb. 6, 2023, 5:35 p.m. UTC | #9
On 06.02.23 18:10, Matthew Wilcox wrote:
> On Mon, Feb 06, 2023 at 05:49:20PM +0100, David Hildenbrand wrote:
>> We have
>>
>> +	if (!cow) {
>> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
>> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>> +	} else {
>> +		/*
>> +		 * rmap code is not ready to handle COW with anonymous
>> +		 * large folio yet. Capture and warn if large folio
>> +		 * is given.
>> +		 */
>> +		VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
>> +	}
>>
>> now.
>>
>> What are we supposed to add on the else branch instead that would be
>> correct in the future? Or not look weird?
> 
> Right now, I think this patch should look something like this.
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 7a04a1130ec1..2f6173f83d8b 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4257,15 +4257,18 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>   }
>   #endif
>   
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +void set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		struct page *page, unsigned int nr, unsigned long addr)
>   {
>   	struct vm_area_struct *vma = vmf->vma;
>   	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>   	bool write = vmf->flags & FAULT_FLAG_WRITE;
>   	bool prefault = vmf->address != addr;
>   	pte_t entry;
> +	unsigned int i;
>   
> -	flush_icache_page(vma, page);
> +	for (i = 0; i < nr; i++)
> +		flush_icache_page(vma, page + i);
>   	entry = mk_pte(page, vma->vm_page_prot);
>   
>   	if (prefault && arch_wants_old_prefaulted_pte())
> @@ -4279,14 +4282,15 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>   		entry = pte_mkuffd_wp(entry);
>   	/* copy-on-write page */
>   	if (write && !(vma->vm_flags & VM_SHARED)) {
> -		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -		page_add_new_anon_rmap(page, vma, addr);
> -		lru_cache_add_inactive_or_unevictable(page, vma);
> +		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
> +		VM_BUG_ON_FOLIO(nr != 1, folio);

^ what I asked for (a WARN would be sufficient IMHO). I don't care
precisely how we tell the educated reader that this function only
handles this special case (I could even be convinced that a comment is
good enough ;) ).

> +		folio_add_new_anon_rmap(folio, vma, addr);
> +		folio_add_lru_vma(folio, vma);
>   	} else {
> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
> -		page_add_file_rmap(page, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +		folio_add_file_rmap_range(folio, page, nr, vma, false);
>   	}
> -	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
> +	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
>   }
>   
>   static bool vmf_pte_changed(struct vm_fault *vmf)
> @@ -4359,7 +4363,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>   
>   	/* Re-check under ptl */
>   	if (likely(!vmf_pte_changed(vmf))) {
> -		do_set_pte(vmf, page, vmf->address);
> +		struct folio *folio = page_folio(page);
> +
> +		set_pte_range(vmf, folio, page, 1, vmf->address);
>   
>   		/* no need to invalidate: a not-present page won't be cached */
>   		update_mmu_cache(vma, vmf->address, vmf->pte);
> 
>> Go on, scream louder at me, I don't care.
> 
> I'm not even shouting.  I just think you're wrong ;-)
> 

Good ;)
Yin Fengwei Feb. 7, 2023, 6:05 a.m. UTC | #10
On 2/7/2023 1:35 AM, David Hildenbrand wrote:
> On 06.02.23 18:10, Matthew Wilcox wrote:
>> On Mon, Feb 06, 2023 at 05:49:20PM +0100, David Hildenbrand wrote:
>>> We have
>>>
>>> +    if (!cow) {
>>> +        folio_add_file_rmap_range(folio, start, nr, vma, false);
>>> +        add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>>> +    } else {
>>> +        /*
>>> +         * rmap code is not ready to handle COW with anonymous
>>> +         * large folio yet. Capture and warn if large folio
>>> +         * is given.
>>> +         */
>>> +        VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
>>> +    }
>>>
>>> now.
>>>
>>> What are we supposed to add on the else branch instead that would be
>>> correct in the future? Or not look weird?
>>
>> Right now, I think this patch should look something like this.
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 7a04a1130ec1..2f6173f83d8b 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4257,15 +4257,18 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>   }
>>   #endif
>>
>> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>> +void set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +        struct page *page, unsigned int nr, unsigned long addr)
>>   {
>>       struct vm_area_struct *vma = vmf->vma;
>>       bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>>       bool write = vmf->flags & FAULT_FLAG_WRITE;
>>       bool prefault = vmf->address != addr;
>>       pte_t entry;
>> +    unsigned int i;
>>
>> -    flush_icache_page(vma, page);
>> +    for (i = 0; i < nr; i++)
>> +        flush_icache_page(vma, page + i);
>>       entry = mk_pte(page, vma->vm_page_prot);
>>
>>       if (prefault && arch_wants_old_prefaulted_pte())
>> @@ -4279,14 +4282,15 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>           entry = pte_mkuffd_wp(entry);
>>       /* copy-on-write page */
>>       if (write && !(vma->vm_flags & VM_SHARED)) {
>> -        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>> -        page_add_new_anon_rmap(page, vma, addr);
>> -        lru_cache_add_inactive_or_unevictable(page, vma);
>> +        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
>> +        VM_BUG_ON_FOLIO(nr != 1, folio);
> 
> ^ what I asked for (a WARN would be sufficient IMHO). I don't care
> precisely how we tell the educated reader that this function only
> handles this special case (I could even be convinced that a comment
> is good enough ;) ).
David, thanks. I am going to move both the cow and !cow cases into
set_pte_range() and WARN if a large folio is passed in.
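
The restructured function would combine both branches, roughly like
this sketch (merging Matthew's proposal above with David's requested
WARN):

    if (write && !(vma->vm_flags & VM_SHARED)) {
            /* anon COW path: large folios are not supported here yet */
            VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
            add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
            folio_add_new_anon_rmap(folio, vma, addr);
            folio_add_lru_vma(folio, vma);
    } else {
            add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
            folio_add_file_rmap_range(folio, page, nr, vma, false);
    }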

> 
>> +        folio_add_new_anon_rmap(folio, vma, addr);
>> +        folio_add_lru_vma(folio, vma);
>>       } else {
>> -        inc_mm_counter(vma->vm_mm, mm_counter_file(page));
>> -        page_add_file_rmap(page, vma, false);
>> +        add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>> +        folio_add_file_rmap_range(folio, page, nr, vma, false);
>>       }
>> -    set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
>> +    set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
Matthew, I think we can't use set_ptes() here until pte_next() is ready.
I will keep this as a loop that generates the correct entry for each
page, and we can replace the loop with set_ptes() once pte_next() is
ready.
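
The interim loop would look roughly like this, reusing the entry
construction already in this patch:

    unsigned int i;

    for (i = 0; i < nr; i++) {
            pte_t entry = mk_pte(page + i, vma->vm_page_prot);

            if (prefault && arch_wants_old_prefaulted_pte())
                    entry = pte_mkold(entry);
            else
                    entry = pte_sw_mkyoung(entry);
            if (write)
                    entry = maybe_mkwrite(pte_mkdirty(entry), vma);
            if (unlikely(uffd_wp))
                    entry = pte_mkuffd_wp(entry);
            set_pte_at(vma->vm_mm, addr + i * PAGE_SIZE,
                       vmf->pte + i, entry);
    }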

Let me know if I get something wrong. Thanks.

Regards
Yin, Fengwei

>>   }
>>
>>   static bool vmf_pte_changed(struct vm_fault *vmf)
>> @@ -4359,7 +4363,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>
>>       /* Re-check under ptl */
>>       if (likely(!vmf_pte_changed(vmf))) {
>> -        do_set_pte(vmf, page, vmf->address);
>> +        struct folio *folio = page_folio(page);
>> +
>> +        set_pte_range(vmf, folio, page, 1, vmf->address);
>>
>>           /* no need to invalidate: a not-present page won't be cached */
>>           update_mmu_cache(vma, vmf->address, vmf->pte);
>>
>>> Go on, scream louder at me, I don't care.
>>
>> I'm not even shouting.  I just think you're wrong ;-)
>>
> 
> Good ;)
>
Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d6f8f41514cc..93192f04b276 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1162,6 +1162,9 @@  static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
+void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		unsigned long addr, pte_t *pte,
+		unsigned long start, unsigned int nr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1c37376fc8d5..6f110b9e5d27 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3376,7 +3376,6 @@  static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 
 		ref_count++;
 		do_set_pte(vmf, page, addr);
-		update_mmu_cache(vma, addr, vmf->pte);
 	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
 
 	/* Restore the vmf->pte */
diff --git a/mm/memory.c b/mm/memory.c
index 7a04a1130ec1..51f8bd91d9f0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4257,36 +4257,65 @@  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		unsigned long addr, pte_t *pte,
+		unsigned long start, unsigned int nr)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool cow = write && !(vma->vm_flags & VM_SHARED);
 	bool prefault = vmf->address != addr;
+	struct page *page = folio_page(folio, start);
 	pte_t entry;
 
-	flush_icache_page(vma, page);
-	entry = mk_pte(page, vma->vm_page_prot);
+	if (!cow) {
+		folio_add_file_rmap_range(folio, start, nr, vma, false);
+		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+	} else {
+		/*
+		 * rmap code is not ready to handle COW with anonymous
+		 * large folio yet. Capture and warn if large folio
+		 * is given.
+		 */
+		VM_WARN_ON_FOLIO(folio_test_large(folio), folio);
+	}
 
-	if (prefault && arch_wants_old_prefaulted_pte())
-		entry = pte_mkold(entry);
-	else
-		entry = pte_sw_mkyoung(entry);
+	do {
+		flush_icache_page(vma, page);
+		entry = mk_pte(page, vma->vm_page_prot);
 
-	if (write)
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	if (unlikely(uffd_wp))
-		entry = pte_mkuffd_wp(entry);
-	/* copy-on-write page */
-	if (write && !(vma->vm_flags & VM_SHARED)) {
+		if (prefault && arch_wants_old_prefaulted_pte())
+			entry = pte_mkold(entry);
+		else
+			entry = pte_sw_mkyoung(entry);
+
+		if (write)
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		if (unlikely(uffd_wp))
+			entry = pte_mkuffd_wp(entry);
+		set_pte_at(vma->vm_mm, addr, pte, entry);
+
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, addr, pte);
+	} while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
+}
+
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+{
+	struct folio *folio = page_folio(page);
+	struct vm_area_struct *vma = vmf->vma;
+	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
+			!(vma->vm_flags & VM_SHARED);
+
+	if (cow) {
 		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, addr);
 		lru_cache_add_inactive_or_unevictable(page, vma);
-	} else {
-		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page, vma, false);
 	}
-	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+
+	do_set_pte_range(vmf, folio, addr, vmf->pte,
+			folio_page_idx(folio, page), 1);
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
@@ -4361,9 +4390,6 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 	if (likely(!vmf_pte_changed(vmf))) {
 		do_set_pte(vmf, page, vmf->address);
 
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
-
 		ret = 0;
 	} else {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);