@@ -1115,7 +1115,6 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_wrprotect(pmd);
}
- pmd = pmd_mkold(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
@@ -1225,7 +1224,6 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
}
- pud = pud_mkold(pud);
set_pud_at(dst_mm, addr, dst_pud, pud);
ret = 0;
@@ -886,7 +886,6 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
*/
if (vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
- pte = pte_mkold(pte);
/*
* Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
@@ -1012,6 +1012,11 @@ static enum page_references page_check_references(struct page *page,
* Note: the mark is set for activated pages as well
* so that recently deactivated but used pages are
* quickly recovered.
+ *
+ * Note: fork() copies the referenced bit from the parent's
+ * ptes to the child's, even though the child has not yet
+ * accessed the page. This is to avoid micro-faults on the
+ * child's initial access.
*/
SetPageReferenced(page);
fork clears dirty/accessed bits from new ptes in the child. This logic has existed since mapped page reclaim was done by scanning ptes, when it may have been quite important. Today, with physical-based pte scanning, there is less reason to clear these bits, so this patch avoids clearing the accessed bit in the child. A single accessed bit is treated similarly to many; the difference today is that more than one referenced bit causes the page to be activated, while a single bit causes it to be kept. This patch causes pages shared by fork(2) to be more readily activated, but this heuristic is very fuzzy anyway -- for example, a page can be accessed by multiple threads via a single pte and be just as important as one that is accessed via multiple ptes. In the end I don't believe fork(2) is a significant enough driver of page reclaim behaviour for this to matter much. This and the following change eliminate a major source of the faults that powerpc/radix requires in order to set dirty/accessed bits in ptes, speeding up a fork/exit microbenchmark by about 5% on POWER9 (16600 -> 17500 fork/execs per second). Skylake appears to have a micro-fault overhead too: in a test which allocates 4GB of anonymous memory, reads each page, then forks and times the child reading a byte from each page, the first pass over the pages takes about 1000 cycles per page, while the second pass takes about 27 cycles (TLB miss). With no additional minor faults measured due to either child pass, and the page array well exceeding TLB capacity, the large cost must be micro-faults caused by setting the accessed bit. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> --- mm/huge_memory.c | 2 -- mm/memory.c | 1 - mm/vmscan.c | 5 +++++ 3 files changed, 5 insertions(+), 3 deletions(-)