diff mbox series

mm,page_owner: Fix refcount imbalance

Message ID 20240314144753.16276-1-osalvador@suse.de (mailing list archive)
State New
Headers show
Series mm,page_owner: Fix refcount imbalance | expand

Commit Message

Oscar Salvador March 14, 2024, 2:47 p.m. UTC
Current code does not contemplate scenarios where an allocation and a
free operation on the same pages do not handle the same number of pages
at once.
To give an example, alloc_pages_exact(), where we will allocate a page
of enough order to satisfy the size request, but we will free the
remaining pages right away.

In the above example, we will increment the stack_record refcount
only once, but we will decrease it the same number of times as number
of unused pages we have to free.
This will lead to a warning because of refcount imbalance.

Fix this by recording the number of base pages every stack_record holds,
and only let the last decrementing of refcount succeed if the number of
base pages equals 0, which means we freed all the pages.

As a bonus, show the aggregate of stack_count + base_pages as this gives
a much better picture of the memory usage.

Signed-off-by: Oscar Salvador <osalvador@suse.de>
Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
---
 include/linux/stackdepot.h |  3 ++
 mm/page_owner.c            | 57 +++++++++++++++++++++++++++++++-------
 2 files changed, 50 insertions(+), 10 deletions(-)

Comments

Vlastimil Babka March 15, 2024, 1:21 p.m. UTC | #1
On 3/14/24 15:47, Oscar Salvador wrote:
> Current code does not contemplate scenarios where an allocation and a
> free operation on the same pages do not handle the same number of pages
> at once.
> To give an example, alloc_pages_exact(), where we will allocate a page
> of enough order to satisfy the size request, but we will free the
> remaining pages right away.
> 
> In the above example, we will increment the stack_record refcount
> only once, but we will decrease it the same number of times as number
> of unused pages we have to free.
> This will lead to a warning because of refcount imbalance.
> 
> Fix this by recording the number of base pages every stack_record holds,
> and only let the last decrementing of refcount succeed if the number of
> base pages equals 0, which means we freed all the pages.
> 
> As a bonus, show the aggregate of stack_count + base_pages as this gives
> a much better picture of the memory usage.
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count")
> ---
>  include/linux/stackdepot.h |  3 ++
>  mm/page_owner.c            | 57 +++++++++++++++++++++++++++++++-------
>  2 files changed, 50 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
> index 3c6caa5abc7c..261472807c32 100644
> --- a/include/linux/stackdepot.h
> +++ b/include/linux/stackdepot.h
> @@ -57,6 +57,9 @@ struct stack_record {
>  	u32 size;			/* Number of stored frames */
>  	union handle_parts handle;	/* Constant after initialization */
>  	refcount_t count;
> +#ifdef CONFIG_PAGE_OWNER
> +	unsigned long nr_base_pages;
> +#endif

The stackdepot guys probably won't be thrilled about this addition to
stack_record. Can't we instead make the refcount itself reflect the number
of base pages?

...

> -static void dec_stack_record_count(depot_stack_handle_t handle)
> +static void dec_stack_record_count(depot_stack_handle_t handle,
> +				   unsigned long nr_base_pages)
>  {
>  	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
> +	unsigned long curr_nr_pages;
> +
> +	if (!stack_record)
> +		return;
> +
> +	curr_nr_pages = smp_load_acquire(&stack_record->nr_base_pages);
> +	smp_store_release(&stack_record->nr_base_pages,
> +			  curr_nr_pages - nr_base_pages);

If the intent of this is to have stack_record->nr_base_pages updated
atomically with respect to parallel updates, these smp_ operations won't
help I'm afraid.

> +	curr_nr_pages = smp_load_acquire(&stack_record->nr_base_pages);
> +
> +	/*
> +	 * If this stack_record is going to reach a refcount == 1, which means
> +	 * free, only do it if all the base pages it allocated were freed.
> +	 * E.g: scenarios like THP splitting, or alloc_pages_exact() can have
> +	 * an alloc/free operation with different amount of pages
> +	 */
> +	if (refcount_read(&stack_record->count) == 2 &&
> +	    curr_nr_pages)
> +		return;

This is very suspicious. We shouldn't manipulate refcount based on the other
counter. This suggests the refcount will eventually stop reflecting reality
as we could withhold some legitimate decreases and not retry them afterwards?
Another reason to try making refcount itself represent nr_base_pages.


>  
> -	if (stack_record)
> -		refcount_dec(&stack_record->count);
> +	refcount_dec(&stack_record->count);

This refcount_read() followed by refcount_dec() is also potentially racy.

>  }
>  
>  void __reset_page_owner(struct page *page, unsigned short order)
> @@ -260,7 +294,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
>  		 * the machinery is not ready yet, we cannot decrement
>  		 * their refcount either.
>  		 */
> -		dec_stack_record_count(alloc_handle);
> +		dec_stack_record_count(alloc_handle, 1UL << order);
>  }
>  
>  static inline void __set_page_owner_handle(struct page_ext *page_ext,
> @@ -303,7 +337,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
>  	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
>  	page_ext_put(page_ext);
>  	set_current_in_page_owner();
> -	inc_stack_record_count(handle, gfp_mask);
> +	inc_stack_record_count(handle, gfp_mask, 1UL << order);
>  	unset_current_in_page_owner();
>  }
>  
> @@ -868,6 +902,7 @@ static int stack_print(struct seq_file *m, void *v)
>  	struct stack *stack = v;
>  	unsigned long *entries;
>  	unsigned long nr_entries;
> +	unsigned long nr_base_pages;
>  	struct stack_record *stack_record = stack->stack_record;
>  
>  	if (!stack->stack_record)
> @@ -875,6 +910,7 @@ static int stack_print(struct seq_file *m, void *v)
>  
>  	nr_entries = stack_record->size;
>  	entries = stack_record->entries;
> +	nr_base_pages = stack_record->nr_base_pages;
>  	stack_count = refcount_read(&stack_record->count) - 1;
>  
>  	if (stack_count < 1 || stack_count < page_owner_stack_threshold)
> @@ -882,7 +918,8 @@ static int stack_print(struct seq_file *m, void *v)
>  
>  	for (i = 0; i < nr_entries; i++)
>  		seq_printf(m, " %pS\n", (void *)entries[i]);
> -	seq_printf(m, "stack_count: %d\n\n", stack_count);
> +	seq_printf(m, "stack_count: %d curr_nr_base_pages: %lu\n\n",
> +		   stack_count, nr_base_pages);
>  
>  	return 0;
>  }
Oscar Salvador March 15, 2024, 3:55 p.m. UTC | #2
On 2024-03-15 14:21, Vlastimil Babka wrote:
> The stackdepot guys probably won't be thrilled about this addition to
> stack_record. Can't we instead make the refcount itself reflect the number
> of base pages?

Yes, we can. I already have a patch doing that, much simpler and much 
shorter.

I will send it out later today.

Thanks!
diff mbox series

Patch

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 3c6caa5abc7c..261472807c32 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -57,6 +57,9 @@  struct stack_record {
 	u32 size;			/* Number of stored frames */
 	union handle_parts handle;	/* Constant after initialization */
 	refcount_t count;
+#ifdef CONFIG_PAGE_OWNER
+	unsigned long nr_base_pages;
+#endif
 	union {
 		unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
 		struct {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 50111078ecd9..5192449bb581 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -123,10 +123,14 @@  static __init void init_page_owner(void)
 	/* Initialize dummy and failure stacks and link them to stack_list */
 	dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
 	failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
-	if (dummy_stack.stack_record)
+	if (dummy_stack.stack_record) {
+		dummy_stack.stack_record->nr_base_pages = 0;
 		refcount_set(&dummy_stack.stack_record->count, 1);
-	if (failure_stack.stack_record)
+	}
+	if (failure_stack.stack_record) {
+		failure_stack.stack_record->nr_base_pages = 0;
 		refcount_set(&failure_stack.stack_record->count, 1);
+	}
 	dummy_stack.next = &failure_stack;
 	stack_list = &dummy_stack;
 }
@@ -192,9 +196,11 @@  static void add_stack_record_to_list(struct stack_record *stack_record,
 	spin_unlock_irqrestore(&stack_list_lock, flags);
 }
 
-static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
+static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
+				   unsigned long nr_base_pages)
 {
 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+	unsigned long curr_nr_pages;
 
 	if (!stack_record)
 		return;
@@ -209,19 +215,47 @@  static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
 	if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
 		int old = REFCOUNT_SATURATED;
 
-		if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1))
+		if (atomic_try_cmpxchg_relaxed(&stack_record->count.refs, &old, 1)) {
 			/* Add the new stack_record to our list */
 			add_stack_record_to_list(stack_record, gfp_mask);
+			smp_store_release(&stack_record->nr_base_pages,
+					  nr_base_pages);
+			goto inc;
+		}
 	}
+
+	curr_nr_pages = smp_load_acquire(&stack_record->nr_base_pages);
+	smp_store_release(&stack_record->nr_base_pages,
+			  curr_nr_pages + nr_base_pages);
+inc:
 	refcount_inc(&stack_record->count);
 }
 
-static void dec_stack_record_count(depot_stack_handle_t handle)
+static void dec_stack_record_count(depot_stack_handle_t handle,
+				   unsigned long nr_base_pages)
 {
 	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+	unsigned long curr_nr_pages;
+
+	if (!stack_record)
+		return;
+
+	curr_nr_pages = smp_load_acquire(&stack_record->nr_base_pages);
+	smp_store_release(&stack_record->nr_base_pages,
+			  curr_nr_pages - nr_base_pages);
+	curr_nr_pages = smp_load_acquire(&stack_record->nr_base_pages);
+
+	/*
+	 * If this stack_record is going to reach a refcount == 1, which means
+	 * free, only do it if all the base pages it allocated were freed.
+	 * E.g: scenarios like THP splitting, or alloc_pages_exact() can have
+	 * an alloc/free operation with different amount of pages
+	 */
+	if (refcount_read(&stack_record->count) == 2 &&
+	    curr_nr_pages)
+		return;
 
-	if (stack_record)
-		refcount_dec(&stack_record->count);
+	refcount_dec(&stack_record->count);
 }
 
 void __reset_page_owner(struct page *page, unsigned short order)
@@ -260,7 +294,7 @@  void __reset_page_owner(struct page *page, unsigned short order)
 		 * the machinery is not ready yet, we cannot decrement
 		 * their refcount either.
 		 */
-		dec_stack_record_count(alloc_handle);
+		dec_stack_record_count(alloc_handle, 1UL << order);
 }
 
 static inline void __set_page_owner_handle(struct page_ext *page_ext,
@@ -303,7 +337,7 @@  noinline void __set_page_owner(struct page *page, unsigned short order,
 	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
 	page_ext_put(page_ext);
 	set_current_in_page_owner();
-	inc_stack_record_count(handle, gfp_mask);
+	inc_stack_record_count(handle, gfp_mask, 1UL << order);
 	unset_current_in_page_owner();
 }
 
@@ -868,6 +902,7 @@  static int stack_print(struct seq_file *m, void *v)
 	struct stack *stack = v;
 	unsigned long *entries;
 	unsigned long nr_entries;
+	unsigned long nr_base_pages;
 	struct stack_record *stack_record = stack->stack_record;
 
 	if (!stack->stack_record)
@@ -875,6 +910,7 @@  static int stack_print(struct seq_file *m, void *v)
 
 	nr_entries = stack_record->size;
 	entries = stack_record->entries;
+	nr_base_pages = stack_record->nr_base_pages;
 	stack_count = refcount_read(&stack_record->count) - 1;
 
 	if (stack_count < 1 || stack_count < page_owner_stack_threshold)
@@ -882,7 +918,8 @@  static int stack_print(struct seq_file *m, void *v)
 
 	for (i = 0; i < nr_entries; i++)
 		seq_printf(m, " %pS\n", (void *)entries[i]);
-	seq_printf(m, "stack_count: %d\n\n", stack_count);
+	seq_printf(m, "stack_count: %d curr_nr_base_pages: %lu\n\n",
+		   stack_count, nr_base_pages);
 
 	return 0;
 }