diff mbox series

[2/2] scftorture: Use a lock-less list to free memory.

Message ID 20241104105053.2182833-2-bigeasy@linutronix.de (mailing list archive)
State New
Headers show
Series [1/2] scftorture: Move memory allocation outside of preempt_disable region. | expand

Commit Message

Sebastian Andrzej Siewior Nov. 4, 2024, 10:50 a.m. UTC
scf_handler() is used as a SMP function call. This function is always
invoked in IRQ-context even with forced-threading enabled. This function
frees memory which not allowed on PREEMPT_RT because the locking
underneath is using sleeping locks.

Add a per-CPU scf_free_pool where each SMP functions adds its memory to
be freed. This memory is then freed by scftorture_invoker() on each
iteration. On the majority of invocations the number of items is less
than five. If the thread sleeps/ gets delayed the number exceed 350 but
did not reach 400 in testing. These were the spikes during testing.
The bulk free of 64 pointers at once should improve the give-back if the
list grows. The list size is ~1.3 items per invocations.

Having one global scf_free_pool with one cleaning thread let the list
grow to over 10.000 items with 32 CPUs (again, spikes not the average)
especially if the CPU went to sleep. The per-CPU part looks like a good
compromise.

Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
Closes: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/scftorture.c | 47 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

Comments

Boqun Feng Nov. 5, 2024, 1 a.m. UTC | #1
Hi Sebastian,

I love this approach, I think it's better than my workqueue work around,
a few comments below:

On Mon, Nov 04, 2024 at 11:50:53AM +0100, Sebastian Andrzej Siewior wrote:
> scf_handler() is used as a SMP function call. This function is always
> invoked in IRQ-context even with forced-threading enabled. This function
> frees memory which not allowed on PREEMPT_RT because the locking
> underneath is using sleeping locks.
> 
> Add a per-CPU scf_free_pool where each SMP functions adds its memory to
> be freed. This memory is then freed by scftorture_invoker() on each
> iteration. On the majority of invocations the number of items is less
> than five. If the thread sleeps/ gets delayed the number exceed 350 but
> did not reach 400 in testing. These were the spikes during testing.
> The bulk free of 64 pointers at once should improve the give-back if the
> list grows. The list size is ~1.3 items per invocations.
> 
> Having one global scf_free_pool with one cleaning thread let the list
> grow to over 10.000 items with 32 CPUs (again, spikes not the average)
> especially if the CPU went to sleep. The per-CPU part looks like a good
> compromise.
> 
> Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
> Closes: https://lore.kernel.org/lkml/41619255-cdc2-4573-a360-7794fc3614f7@paulmck-laptop/
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  kernel/scftorture.c | 47 +++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 43 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/scftorture.c b/kernel/scftorture.c
> index e5546fe256329..ba9f1125821b8 100644
> --- a/kernel/scftorture.c
> +++ b/kernel/scftorture.c
> @@ -97,6 +97,7 @@ struct scf_statistics {
>  static struct scf_statistics *scf_stats_p;
>  static struct task_struct *scf_torture_stats_task;
>  static DEFINE_PER_CPU(long long, scf_invoked_count);
> +static DEFINE_PER_CPU(struct llist_head, scf_free_pool);
>  
>  // Data for random primitive selection
>  #define SCF_PRIM_RESCHED	0
> @@ -133,6 +134,7 @@ struct scf_check {
>  	bool scfc_wait;
>  	bool scfc_rpc;
>  	struct completion scfc_completion;
> +	struct llist_node scf_node;
>  };
>  
>  // Use to wait for all threads to start.
> @@ -148,6 +150,40 @@ static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
>  
>  extern void resched_cpu(int cpu); // An alternative IPI vector.
>  
> +static void scf_add_to_free_list(struct scf_check *scfcp)
> +{
> +	struct llist_head *pool;
> +	unsigned int cpu;
> +
> +	cpu = raw_smp_processor_id() % nthreads;
> +	pool = &per_cpu(scf_free_pool, cpu);
> +	llist_add(&scfcp->scf_node, pool);
> +}
> +
> +static void scf_cleanup_free_list(unsigned int cpu)
> +{
> +	struct llist_head *pool;
> +	struct llist_node *node;
> +	struct scf_check *scfcp;
> +	unsigned int slot = 0;
> +	void *free_pool[64];
> +
> +	pool = &per_cpu(scf_free_pool, cpu);
> +	node = llist_del_all(pool);
> +	while (node) {
> +		scfcp = llist_entry(node, struct scf_check, scf_node);
> +		node = node->next;
> +		free_pool[slot] = scfcp;
> +		slot++;
> +		if (slot == ARRAY_SIZE(free_pool)) {
> +			kfree_bulk(slot, free_pool);
> +			slot = 0;
> +		}
> +	}
> +	if (slot)
> +		kfree_bulk(slot, free_pool);
> +}
> +
>  // Print torture statistics.  Caller must ensure serialization.
>  static void scf_torture_stats_print(void)
>  {
> @@ -296,7 +332,7 @@ static void scf_handler(void *scfc_in)
>  		if (scfcp->scfc_rpc)
>  			complete(&scfcp->scfc_completion);
>  	} else {
> -		kfree(scfcp);
> +		scf_add_to_free_list(scfcp);
>  	}
>  }
>  
> @@ -363,7 +399,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
>  				scfp->n_single_wait_ofl++;
>  			else
>  				scfp->n_single_ofl++;
> -			kfree(scfcp);
> +			scf_add_to_free_list(scfcp);
>  			scfcp = NULL;
>  		}
>  		break;
> @@ -391,7 +427,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
>  				preempt_disable();
>  		} else {
>  			scfp->n_single_rpc_ofl++;
> -			kfree(scfcp);
> +			scf_add_to_free_list(scfcp);
>  			scfcp = NULL;
>  		}
>  		break;
> @@ -428,7 +464,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
>  			pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
>  			atomic_inc(&n_mb_out_errs); // Leak rather than trash!
>  		} else {
> -			kfree(scfcp);
> +			scf_add_to_free_list(scfcp);
>  		}
>  		barrier(); // Prevent race-reduction compiler optimizations.
>  	}
> @@ -442,6 +478,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
>  		schedule_timeout_uninterruptible(1);
>  }
>  
> +
>  // SCF test kthread.  Repeatedly does calls to members of the
>  // smp_call_function() family of functions.
>  static int scftorture_invoker(void *arg)
> @@ -479,6 +516,8 @@ static int scftorture_invoker(void *arg)
>  	VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
>  
>  	do {
> +		scf_cleanup_free_list(scfp->cpu);
> +

I think this needs to be:

		scf_cleanup_free_list(cpu);

or

		scf_cleanup_free_list(curcpu);

because scfp->cpu is actually the thread number, and I got a NULL
dereference:

[   14.219225] BUG: unable to handle page fault for address: ffffffffb2ff7210

while running Paul's reproduce command:

tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration 2 --configs PREEMPT --kconfig CONFIG_NR_CPUS=64 --memory 7G --trust-make --kasan --bootargs "scftorture.nthreads=64 torture.disable_onoff_at_boot csdlock_debug=1"

on my 48 cores VM (I think 48 core may be key to reproduce the NULL
dereference).


Another thing is, how do we guarantee that we don't exit the loop
eariler (i.e. while there are still callbacks on the list)? After the
following scftorture_invoke_one(), there could an IPI pending somewhere,
and we may exit this loop if torture_must_stop() is true. And that IPI
might add its scf_check to the list but no scf_cleanup_free_list() is
going to handle that, right?

Regards,
Boqun

>  		scftorture_invoke_one(scfp, &rand);
>  		while (cpu_is_offline(cpu) && !torture_must_stop()) {
>  			schedule_timeout_interruptible(HZ / 5);
> -- 
> 2.45.2
>
Sebastian Andrzej Siewior Nov. 7, 2024, 11:21 a.m. UTC | #2
On 2024-11-04 17:00:19 [-0800], Boqun Feng wrote:
> Hi Sebastian,
Hi Boqun,

…
> I think this needs to be:
> 
> 		scf_cleanup_free_list(cpu);
> 
> or
> 
> 		scf_cleanup_free_list(curcpu);
> 
> because scfp->cpu is actually the thread number, and I got a NULL
> dereference:
> 
> [   14.219225] BUG: unable to handle page fault for address: ffffffffb2ff7210

Right. Replaced with cpu.
…
> 
> Another thing is, how do we guarantee that we don't exit the loop
> eariler (i.e. while there are still callbacks on the list)? After the
> following scftorture_invoke_one(), there could an IPI pending somewhere,
> and we may exit this loop if torture_must_stop() is true. And that IPI
> might add its scf_check to the list but no scf_cleanup_free_list() is
> going to handle that, right?

Okay. Assuming that IPIs are done by the time scf_torture_cleanup is
invoked, I added scf_cleanup_free_list() for all CPUs there.

Reposted at
	https://lore.kernel.org/20241107111821.3417762-1-bigeasy@linutronix.de

> Regards,
> Boqun

Sebastian
Paul E. McKenney Nov. 7, 2024, 2:08 p.m. UTC | #3
On Thu, Nov 07, 2024 at 12:21:07PM +0100, Sebastian Andrzej Siewior wrote:
> On 2024-11-04 17:00:19 [-0800], Boqun Feng wrote:
> > Hi Sebastian,
> Hi Boqun,
> 
> …
> > I think this needs to be:
> > 
> > 		scf_cleanup_free_list(cpu);
> > 
> > or
> > 
> > 		scf_cleanup_free_list(curcpu);
> > 
> > because scfp->cpu is actually the thread number, and I got a NULL
> > dereference:
> > 
> > [   14.219225] BUG: unable to handle page fault for address: ffffffffb2ff7210
> 
> Right. Replaced with cpu.
> …
> > 
> > Another thing is, how do we guarantee that we don't exit the loop
> > eariler (i.e. while there are still callbacks on the list)? After the
> > following scftorture_invoke_one(), there could an IPI pending somewhere,
> > and we may exit this loop if torture_must_stop() is true. And that IPI
> > might add its scf_check to the list but no scf_cleanup_free_list() is
> > going to handle that, right?
> 
> Okay. Assuming that IPIs are done by the time scf_torture_cleanup is
> invoked, I added scf_cleanup_free_list() for all CPUs there.

This statement in scf_torture_cleanup() is supposed to wait for all
outstanding IPIs:

	smp_call_function(scf_cleanup_handler, NULL, 0);

And the scf_cleanup_handler() function is as follows:

	static void scf_cleanup_handler(void *unused)
	{
	}

Does that work, or am I yet again being overly naive?

> Reposted at
> 	https://lore.kernel.org/20241107111821.3417762-1-bigeasy@linutronix.de

Thank you!

I will do some testing on this later today.

							Thanx, Paul
Sebastian Andrzej Siewior Nov. 7, 2024, 2:43 p.m. UTC | #4
On 2024-11-07 06:08:35 [-0800], Paul E. McKenney wrote:
…
> This statement in scf_torture_cleanup() is supposed to wait for all
> outstanding IPIs:
> 
> 	smp_call_function(scf_cleanup_handler, NULL, 0);

This should be
	smp_call_function(scf_cleanup_handler, NULL, 1);

so it queues the function call and waits for its completion. Otherwise
it is queued and might be invoked _later_.

> And the scf_cleanup_handler() function is as follows:
> 
> 	static void scf_cleanup_handler(void *unused)
> 	{
> 	}
> 
> Does that work, or am I yet again being overly naive?

See above. I can send a patch later on if you have no other complains ;)

> 							Thanx, Paul

Sebastian
Paul E. McKenney Nov. 7, 2024, 2:59 p.m. UTC | #5
On Thu, Nov 07, 2024 at 03:43:00PM +0100, Sebastian Andrzej Siewior wrote:
> On 2024-11-07 06:08:35 [-0800], Paul E. McKenney wrote:
> …
> > This statement in scf_torture_cleanup() is supposed to wait for all
> > outstanding IPIs:
> > 
> > 	smp_call_function(scf_cleanup_handler, NULL, 0);
> 
> This should be
> 	smp_call_function(scf_cleanup_handler, NULL, 1);
> 
> so it queues the function call and waits for its completion. Otherwise
> it is queued and might be invoked _later_.
> 
> > And the scf_cleanup_handler() function is as follows:
> > 
> > 	static void scf_cleanup_handler(void *unused)
> > 	{
> > 	}
> > 
> > Does that work, or am I yet again being overly naive?
> 
> See above. I can send a patch later on if you have no other complains ;)

You got me on that one!  Thank you, and please do feel free to send
a patch.

Interestingly enough, this has never failed.  Perhaps because the usual
scftorture run injects enough idle time that all the IPIs have time
to finish.  ;-)

							Thanx, Paul
diff mbox series

Patch

diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index e5546fe256329..ba9f1125821b8 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -97,6 +97,7 @@  struct scf_statistics {
 static struct scf_statistics *scf_stats_p;
 static struct task_struct *scf_torture_stats_task;
 static DEFINE_PER_CPU(long long, scf_invoked_count);
+static DEFINE_PER_CPU(struct llist_head, scf_free_pool);
 
 // Data for random primitive selection
 #define SCF_PRIM_RESCHED	0
@@ -133,6 +134,7 @@  struct scf_check {
 	bool scfc_wait;
 	bool scfc_rpc;
 	struct completion scfc_completion;
+	struct llist_node scf_node;
 };
 
 // Use to wait for all threads to start.
@@ -148,6 +150,40 @@  static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
 
 extern void resched_cpu(int cpu); // An alternative IPI vector.
 
+static void scf_add_to_free_list(struct scf_check *scfcp)
+{
+	struct llist_head *pool;
+	unsigned int cpu;
+
+	cpu = raw_smp_processor_id() % nthreads;
+	pool = &per_cpu(scf_free_pool, cpu);
+	llist_add(&scfcp->scf_node, pool);
+}
+
+static void scf_cleanup_free_list(unsigned int cpu)
+{
+	struct llist_head *pool;
+	struct llist_node *node;
+	struct scf_check *scfcp;
+	unsigned int slot = 0;
+	void *free_pool[64];
+
+	pool = &per_cpu(scf_free_pool, cpu);
+	node = llist_del_all(pool);
+	while (node) {
+		scfcp = llist_entry(node, struct scf_check, scf_node);
+		node = node->next;
+		free_pool[slot] = scfcp;
+		slot++;
+		if (slot == ARRAY_SIZE(free_pool)) {
+			kfree_bulk(slot, free_pool);
+			slot = 0;
+		}
+	}
+	if (slot)
+		kfree_bulk(slot, free_pool);
+}
+
 // Print torture statistics.  Caller must ensure serialization.
 static void scf_torture_stats_print(void)
 {
@@ -296,7 +332,7 @@  static void scf_handler(void *scfc_in)
 		if (scfcp->scfc_rpc)
 			complete(&scfcp->scfc_completion);
 	} else {
-		kfree(scfcp);
+		scf_add_to_free_list(scfcp);
 	}
 }
 
@@ -363,7 +399,7 @@  static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 				scfp->n_single_wait_ofl++;
 			else
 				scfp->n_single_ofl++;
-			kfree(scfcp);
+			scf_add_to_free_list(scfcp);
 			scfcp = NULL;
 		}
 		break;
@@ -391,7 +427,7 @@  static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 				preempt_disable();
 		} else {
 			scfp->n_single_rpc_ofl++;
-			kfree(scfcp);
+			scf_add_to_free_list(scfcp);
 			scfcp = NULL;
 		}
 		break;
@@ -428,7 +464,7 @@  static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 			pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
 			atomic_inc(&n_mb_out_errs); // Leak rather than trash!
 		} else {
-			kfree(scfcp);
+			scf_add_to_free_list(scfcp);
 		}
 		barrier(); // Prevent race-reduction compiler optimizations.
 	}
@@ -442,6 +478,7 @@  static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
 		schedule_timeout_uninterruptible(1);
 }
 
+
 // SCF test kthread.  Repeatedly does calls to members of the
 // smp_call_function() family of functions.
 static int scftorture_invoker(void *arg)
@@ -479,6 +516,8 @@  static int scftorture_invoker(void *arg)
 	VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
 
 	do {
+		scf_cleanup_free_list(scfp->cpu);
+
 		scftorture_invoke_one(scfp, &rand);
 		while (cpu_is_offline(cpu) && !torture_must_stop()) {
 			schedule_timeout_interruptible(HZ / 5);