
[1/1] mm/rmap: optimize MM-ID mapcount handling with union

Message ID 20250420055159.55851-1-lance.yang@linux.dev (mailing list archive)
State New

Commit Message

Lance Yang April 20, 2025, 5:51 a.m. UTC
From: Lance Yang <lance.yang@linux.dev>

Move '_mm_id_mapcount' into a union with an unsigned long
'_mm_id_mapcounts'. That way, we can zap both slots to -1 at once with a
single -1UL store, which works on both 32-bit and 64-bit systems and keeps
the compiler happy, with no memory or performance overhead.

Also, replace the two per-slot MM_ID_DUMMY checks on '_mm_id' with a single
check of '_mm_ids'. This is equivalent because MM_ID_DUMMY is 0, so both
slots holding MM_ID_DUMMY is the same as the whole '_mm_ids' word being 0.

Signed-off-by: Lance Yang <lance.yang@linux.dev>
---
 include/linux/mm_types.h |  6 +++++-
 include/linux/rmap.h     |  3 +--
 mm/internal.h            |  9 +++++++--
 mm/page_alloc.c          | 11 +++++------
 4 files changed, 18 insertions(+), 11 deletions(-)
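
For reference, here is a minimal userspace sketch (not part of the patch;
the union and function names are illustrative) of the layout invariant the
-1UL trick relies on. The conditional typedef mirrors how mm_id_mapcount_t
is defined in mm_types.h:

#include <limits.h>
#include <assert.h>

#if ULONG_MAX == 0xffffffffUL   /* 32-bit: unsigned long is 4 bytes */
typedef short mm_id_mapcount_t; /* two 16-bit slots */
#else                           /* 64-bit: unsigned long is 8 bytes */
typedef int mm_id_mapcount_t;   /* two 32-bit slots */
#endif

union mapcount_slots {
    mm_id_mapcount_t _mm_id_mapcount[2];
    unsigned long _mm_id_mapcounts;
};

/*
 * The -1UL store is only equivalent to the two per-slot stores if the
 * two slots overlay the unsigned long exactly, with no padding.
 */
static_assert(sizeof(union mapcount_slots) == sizeof(unsigned long),
              "both mapcount slots must overlay exactly one unsigned long");

static void init_slots(union mapcount_slots *mc)
{
    /* One store zaps both slots to -1, on 32-bit and 64-bit alike. */
    mc->_mm_id_mapcounts = -1UL;
}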

Comments

David Hildenbrand April 20, 2025, 7:12 a.m. UTC | #1
>   	/* Note: mapcounts start at -1. */
>   	atomic_set(&folio->_large_mapcount, mapcount - 1);
> diff --git a/mm/internal.h b/mm/internal.h
> index 838f840ded83..1505174178f4 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -772,8 +772,13 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>   		atomic_set(&folio->_nr_pages_mapped, 0);
>   	if (IS_ENABLED(CONFIG_MM_ID)) {
>   		folio->_mm_ids = 0;
> -		folio->_mm_id_mapcount[0] = -1;
> -		folio->_mm_id_mapcount[1] = -1;
> +		/*
> +		 * One-shot initialization of both mapcount slots to -1.
> +		 * Using 'unsigned long' ensures cross-arch compatibility:
> +		 * - 32-bit: Fills both short slots (0xFFFF each)
> +		 * - 64-bit: Fills both int slots (0xFFFFFFFF each)
> +		 */
> +		folio->_mm_id_mapcounts = -1UL;

Are we sure the compiler cannot optimize that itself?

On x86-64 I get with gcc 14.2.1:


;               folio->_mm_id_mapcount[0] = -1;
     3f2f: 48 c7 42 60 ff ff ff ff       movq    $-0x1, 0x60(%rdx)

Which should be a quadword (64-bit) store, so exactly what you want to achieve.
Lance Yang April 20, 2025, 8:33 a.m. UTC | #2
On April 20, 2025 at 3:12 PM, "David Hildenbrand" <david@redhat.com> wrote:

> > +		folio->_mm_id_mapcounts = -1UL;
>
> Are we sure the compiler cannot optimize that itself?
>
> On x86-64 I get with gcc 14.2.1:
>
> ;               folio->_mm_id_mapcount[0] = -1;
>      3f2f: 48 c7 42 60 ff ff ff ff       movq    $-0x1, 0x60(%rdx)
>
> Which should be a quadword (64-bit) store, so exactly what you want to achieve.


Yeah, a recent compiler is smart enough to merge the two stores itself.
However, gcc 4.8.5 at -O2 does not, at least on my x86-64 test machine:

struct folio_array {
    int _mm_id_mapcount[2];
};

void init_array(struct folio_array *f) {
    f->_mm_id_mapcount[0] = -1;
    f->_mm_id_mapcount[1] = -1;
}

0000000000000000 <init_array>:
   0:	c7 07 ff ff ff ff    	movl   $0xffffffff,(%rdi)
   6:	c7 47 04 ff ff ff ff 	movl   $0xffffffff,0x4(%rdi)
   d:	c3                   	retq

---

struct folio_union {
    union {
        int _mm_id_mapcount[2];
        unsigned long _mm_id_mapcounts;
    };
};

void init_union(struct folio_union *f) {
    f->_mm_id_mapcounts = -1UL;
}

0000000000000010 <init_union>:
  10:	48 c7 07 ff ff ff ff 	movq   $0xffffffffffffffff,(%rdi)
  17:	c3                   	retq

Hmm... I'm not sure whether this is worth doing just for compilers that
old, though.


Thanks,
Lance

> --
> Cheers,
>
> David / dhildenb

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 56d07edd01f9..0ac80eaa775e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -363,6 +363,7 @@ typedef unsigned short mm_id_t;
  * @_mm_id: Do not use outside of rmap code.
  * @_mm_ids: Do not use outside of rmap code.
  * @_mm_id_mapcount: Do not use outside of rmap code.
+ * @_mm_id_mapcounts: Do not use outside of rmap code.
  * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
  * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
  * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
@@ -435,7 +436,10 @@ struct folio {
 					atomic_t _entire_mapcount;
 					atomic_t _pincount;
 #endif /* CONFIG_64BIT */
-					mm_id_mapcount_t _mm_id_mapcount[2];
+					union {
+						mm_id_mapcount_t _mm_id_mapcount[2];
+						unsigned long _mm_id_mapcounts;
+					};
 					union {
 						mm_id_t _mm_id[2];
 						unsigned long _mm_ids;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6b82b618846e..99c0518c1ad8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -231,8 +231,7 @@ static __always_inline void folio_set_large_mapcount(struct folio *folio,
 {
 	__folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);
 
-	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
-	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);
+	VM_WARN_ON_ONCE(folio->_mm_ids != MM_ID_DUMMY);
 
 	/* Note: mapcounts start at -1. */
 	atomic_set(&folio->_large_mapcount, mapcount - 1);
diff --git a/mm/internal.h b/mm/internal.h
index 838f840ded83..1505174178f4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -772,8 +772,13 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
 		atomic_set(&folio->_nr_pages_mapped, 0);
 	if (IS_ENABLED(CONFIG_MM_ID)) {
 		folio->_mm_ids = 0;
-		folio->_mm_id_mapcount[0] = -1;
-		folio->_mm_id_mapcount[1] = -1;
+		/*
+		 * One-shot initialization of both mapcount slots to -1.
+		 * Using 'unsigned long' ensures cross-arch compatibility:
+		 * - 32-bit: Fills both short slots (0xFFFF each)
+		 * - 64-bit: Fills both int slots (0xFFFFFFFF each)
+		 */
+		folio->_mm_id_mapcounts = -1UL;
 	}
 	if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
 		atomic_set(&folio->_pincount, 0);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bbfd8e4ce0df..f5c5829c4bfa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -976,12 +976,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			goto out;
 		}
 		if (IS_ENABLED(CONFIG_MM_ID)) {
-			if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
-				bad_page(page, "nonzero mm mapcount 0");
-				goto out;
-			}
-			if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
-				bad_page(page, "nonzero mm mapcount 1");
+			if (unlikely(folio->_mm_id_mapcounts != -1UL)) {
+				if (folio->_mm_id_mapcount[0] != -1)
+					bad_page(page, "nonzero mm mapcount 0");
+				if (folio->_mm_id_mapcount[1] != -1)
+					bad_page(page, "nonzero mm mapcount 1");
 				goto out;
 			}
 		}
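
Purely for illustration, the reworked free-path check above boils down to
the following pattern (a standalone userspace sketch, not kernel code; it
assumes the 64-bit layout, the check_mapcounts() name is made up, and
fprintf() stands in for bad_page()):

#include <stdio.h>

union mapcount_slots {
    int _mm_id_mapcount[2];          /* 64-bit layout assumed here */
    unsigned long _mm_id_mapcounts;
};

static int check_mapcounts(const union mapcount_slots *mc)
{
    /* Fast path: a single word compare covers both slots. */
    if (mc->_mm_id_mapcounts == -1UL)
        return 0;

    /* Slow path, taken only on error: report which slot is bad. */
    if (mc->_mm_id_mapcount[0] != -1)
        fprintf(stderr, "nonzero mm mapcount 0\n");
    if (mc->_mm_id_mapcount[1] != -1)
        fprintf(stderr, "nonzero mm mapcount 1\n");
    return -1;
}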