
[RFC,v3] net/core: add optional threading for backlog processing

Message ID 20230219131006.92681-1-nbd@nbd.name (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series [RFC,v3] net/core: add optional threading for backlog processing

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4351 this patch: 4351
netdev/cc_maintainers warning 1 maintainers not CCed: petrm@nvidia.com
netdev/build_clang fail Errors and warnings before: 1025 this patch: 1028
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4562 this patch: 4562
netdev/checkpatch warning
  CHECK: Alignment should match open parenthesis
  CHECK: Blank lines aren't necessary before a close brace '}'
  WARNING: line length of 85 exceeds 80 columns
  WARNING: msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.rst
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Felix Fietkau Feb. 19, 2023, 1:10 p.m. UTC
When dealing with few flows or an imbalance in CPU utilization, static RPS
CPU assignment can be too inflexible. Add support for enabling threaded NAPI
for backlog processing in order to allow the scheduler to balance the
processing better. This helps spread the load across otherwise idle CPUs.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
RFC v3:
 - make patch more generic, applies to backlog processing in general
 - fix process queue access on flush
RFC v2:
 - fix rebase error in rps locking

 include/linux/netdevice.h  |  2 +
 net/core/dev.c             | 78 +++++++++++++++++++++++++++++++++++---
 net/core/sysctl_net_core.c | 27 +++++++++++++
 3 files changed, 102 insertions(+), 5 deletions(-)
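
For reference, the knob added here is the net.core sysctl registered in the
sysctl_net_core.c hunk below; enabling it spawns one "napi/backlog-N" kthread
per possible CPU. A minimal usage sketch (sysctl name and path taken from the
patch itself, not from existing documentation):

# enable threaded backlog processing
echo 1 > /proc/sys/net/core/backlog_threaded
# or, equivalently
sysctl -w net.core.backlog_threaded=1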

Comments

Felix Fietkau Feb. 19, 2023, 3:08 p.m. UTC | #1
On 19.02.23 14:10, Felix Fietkau wrote:
> When dealing with few flows or an imbalance in CPU utilization, static RPS
> CPU assignment can be too inflexible. Add support for enabling threaded NAPI
> for backlog processing in order to allow the scheduler to balance the
> processing better. This helps spread the load across otherwise idle CPUs.
> 
> Signed-off-by: Felix Fietkau <nbd@nbd.name>
> ---
> RFC v3:
>   - make patch more generic, applies to backlog processing in general
>   - fix process queue access on flush
> RFC v2:
>   - fix rebase error in rps locking
> 
>   include/linux/netdevice.h  |  2 +
>   net/core/dev.c             | 78 +++++++++++++++++++++++++++++++++++---
>   net/core/sysctl_net_core.c | 27 +++++++++++++
>   3 files changed, 102 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index d9cdbc047b49..b3cef91b1696 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -522,6 +522,7 @@ static inline bool napi_complete(struct napi_struct *n)
>   }
>   
>   int dev_set_threaded(struct net_device *dev, bool threaded);
> +int backlog_set_threaded(bool threaded);
>   
>   /**
>    *	napi_disable - prevent NAPI from scheduling
> @@ -3192,6 +3193,7 @@ struct softnet_data {
>   	unsigned int		cpu;
>   	unsigned int		input_queue_tail;
>   #endif
> +	unsigned int		process_queue_empty;
>   	unsigned int		received_rps;
>   	unsigned int		dropped;
>   	struct sk_buff_head	input_pkt_queue;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 357081b0113c..76874513b7b5 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4597,7 +4597,7 @@ static int napi_schedule_rps(struct softnet_data *sd)
>   	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
>   
>   #ifdef CONFIG_RPS
> -	if (sd != mysd) {
> +	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
>   		sd->rps_ipi_next = mysd->rps_ipi_list;
>   		mysd->rps_ipi_list = sd;
>   
> @@ -5778,6 +5778,8 @@ static DEFINE_PER_CPU(struct work_struct, flush_works);
>   /* Network device is going away, flush any packets still pending */
>   static void flush_backlog(struct work_struct *work)
>   {
> +	unsigned int process_queue_empty;
> +	bool threaded, flush_processq;
>   	struct sk_buff *skb, *tmp;
>   	struct softnet_data *sd;
>   
> @@ -5792,8 +5794,15 @@ static void flush_backlog(struct work_struct *work)
>   			input_queue_head_incr(sd);
>   		}
>   	}
> +
> +	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
> +	flush_processq = threaded &&
> +			 !skb_queue_empty_lockless(&sd->process_queue);
Sorry, the patch was missing these lines:
	if (flush_processq)
		process_queue_empty = sd->process_queue_empty;

>   	rps_unlock_irq_enable(sd);
>   
> +	if (threaded)
> +		goto out;
> +
>   	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
>   		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
>   			__skb_unlink(skb, &sd->process_queue);
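
With those two lines folded in, the threaded branch of flush_backlog() in this
revision reads roughly as follows (a paraphrase of the hunks above and below,
not a literal replacement patch):

	rps_lock_irq_disable(sd);
	/* ... drop NETREG_UNREGISTERING skbs from input_pkt_queue ... */

	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
	flush_processq = threaded &&
			 !skb_queue_empty_lockless(&sd->process_queue);
	if (flush_processq)
		process_queue_empty = sd->process_queue_empty; /* snapshot */
	rps_unlock_irq_enable(sd);

	/* In threaded mode the process_queue is owned by the backlog
	 * kthread, so it is not walked here. Instead, wait until
	 * process_backlog() has advanced sd->process_queue_empty past the
	 * snapshot, i.e. the kthread has drained the process_queue at
	 * least once since the snapshot was taken.
	 */
	while (flush_processq) {
		msleep(1);
		rps_lock_irq_disable(sd);
		flush_processq = process_queue_empty == sd->process_queue_empty;
		rps_unlock_irq_enable(sd);
	}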
Eric Dumazet Feb. 19, 2023, 9 p.m. UTC | #2
On Sun, Feb 19, 2023 at 4:08 PM Felix Fietkau <nbd@nbd.name> wrote:
>
> On 19.02.23 14:10, Felix Fietkau wrote:
> > When dealing with few flows or an imbalance in CPU utilization, static RPS
> > CPU assignment can be too inflexible. Add support for enabling threaded NAPI
> > for backlog processing in order to allow the scheduler to balance the
> > processing better. This helps spread the load across otherwise idle CPUs.
> >
> > Signed-off-by: Felix Fietkau <nbd@nbd.name>


> Sorry, the patch was missing these lines:

Also make sure the following loop won't leak kthreads.

while :
do
 echo 1 >/proc/sys/net/core/backlog_threaded
 echo 0 >/proc/sys/net/core/backlog_threaded
done

Some documentation would also be welcomed
(Documentation/admin-guide/sysctl/net.rst &
Documentation/networking/scaling.rst).
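
One way to avoid that leak would be to mirror what dev_set_threaded() already
does for per-device NAPIs: create a backlog kthread only when enabling and only
if the CPU does not already have one, and on disable merely clear the THREADED
bit, leaving existing kthreads idle for the next enable. A minimal sketch of
such a guard inside backlog_set_threaded() (an assumption about how this could
be addressed, not part of the posted patch):

	if (threaded) {
		for_each_possible_cpu(i) {
			struct softnet_data *sd = &per_cpu(softnet_data, i);
			struct napi_struct *n = &sd->backlog;

			/* Reuse a kthread created by a previous enable;
			 * never spawn a second one for the same CPU.
			 */
			if (n->thread)
				continue;

			n->thread = kthread_run(napi_threaded_poll, n,
						"napi/backlog-%d", i);
			if (IS_ERR(n->thread)) {
				err = PTR_ERR(n->thread);
				pr_err("kthread_run failed with err %d\n", err);
				n->thread = NULL;
				threaded = false;
				break;
			}
		}
	}

With a guard like this, the enable/disable loop above only toggles
NAPIF_STATE_THREADED after the first enable, and no kthreads accumulate.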

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d9cdbc047b49..b3cef91b1696 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -522,6 +522,7 @@  static inline bool napi_complete(struct napi_struct *n)
 }
 
 int dev_set_threaded(struct net_device *dev, bool threaded);
+int backlog_set_threaded(bool threaded);
 
 /**
  *	napi_disable - prevent NAPI from scheduling
@@ -3192,6 +3193,7 @@  struct softnet_data {
 	unsigned int		cpu;
 	unsigned int		input_queue_tail;
 #endif
+	unsigned int		process_queue_empty;
 	unsigned int		received_rps;
 	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
diff --git a/net/core/dev.c b/net/core/dev.c
index 357081b0113c..76874513b7b5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4597,7 +4597,7 @@  static int napi_schedule_rps(struct softnet_data *sd)
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
 #ifdef CONFIG_RPS
-	if (sd != mysd) {
+	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
@@ -5778,6 +5778,8 @@  static DEFINE_PER_CPU(struct work_struct, flush_works);
 /* Network device is going away, flush any packets still pending */
 static void flush_backlog(struct work_struct *work)
 {
+	unsigned int process_queue_empty;
+	bool threaded, flush_processq;
 	struct sk_buff *skb, *tmp;
 	struct softnet_data *sd;
 
@@ -5792,8 +5794,15 @@  static void flush_backlog(struct work_struct *work)
 			input_queue_head_incr(sd);
 		}
 	}
+
+	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
+	flush_processq = threaded &&
+			 !skb_queue_empty_lockless(&sd->process_queue);
 	rps_unlock_irq_enable(sd);
 
+	if (threaded)
+		goto out;
+
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
@@ -5801,7 +5810,16 @@  static void flush_backlog(struct work_struct *work)
 			input_queue_head_incr(sd);
 		}
 	}
+
+out:
 	local_bh_enable();
+
+	while (flush_processq) {
+		msleep(1);
+		rps_lock_irq_disable(sd);
+		flush_processq = process_queue_empty == sd->process_queue_empty;
+		rps_unlock_irq_enable(sd);
+	}
 }
 
 static bool flush_required(int cpu)
@@ -5933,16 +5951,16 @@  static int process_backlog(struct napi_struct *napi, int quota)
 		}
 
 		rps_lock_irq_disable(sd);
+		sd->process_queue_empty++;
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 			/*
 			 * Inline a custom version of __napi_complete().
-			 * only current cpu owns and manipulates this napi,
-			 * and NAPI_STATE_SCHED is the only possible flag set
-			 * on backlog.
+			 * only current cpu owns and manipulates this napi.
 			 * We can use a plain write instead of clear_bit(),
 			 * and we dont need an smp_mb() memory barrier.
 			 */
-			napi->state = 0;
+			napi->state &= ~(NAPIF_STATE_SCHED |
+					 NAPIF_STATE_SCHED_THREADED);
 			again = false;
 		} else {
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
@@ -6356,6 +6374,53 @@  int dev_set_threaded(struct net_device *dev, bool threaded)
 }
 EXPORT_SYMBOL(dev_set_threaded);
 
+int backlog_set_threaded(bool threaded)
+{
+	static bool backlog_threaded;
+	int err = 0;
+	int i;
+
+	if (backlog_threaded == threaded)
+		return 0;
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+
+		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
+		if (IS_ERR(n->thread)) {
+			err = PTR_ERR(n->thread);
+			pr_err("kthread_run failed with err %d\n", err);
+			n->thread = NULL;
+			threaded = false;
+			break;
+		}
+
+	}
+
+	backlog_threaded = threaded;
+
+	/* Make sure kthread is created before THREADED bit
+	 * is set.
+	 */
+	smp_mb__before_atomic();
+
+	for_each_possible_cpu(i) {
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
+		struct napi_struct *n = &sd->backlog;
+		unsigned long flags;
+
+		rps_lock_irqsave(sd, &flags);
+		if (threaded)
+			n->state |= NAPIF_STATE_THREADED;
+		else
+			n->state &= ~NAPIF_STATE_THREADED;
+		rps_unlock_irq_restore(sd, &flags);
+	}
+
+	return err;
+}
+
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 			   int (*poll)(struct napi_struct *, int), int weight)
 {
@@ -11114,6 +11179,9 @@  static int dev_cpu_dead(unsigned int oldcpu)
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_enable();
 
+	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
+		return 0;
+
 #ifdef CONFIG_RPS
 	remsd = oldsd->rps_ipi_list;
 	oldsd->rps_ipi_list = NULL;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 7130e6d9e263..3eea703b69d7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -30,6 +30,7 @@  static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static int backlog_threaded;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -165,6 +166,23 @@  static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 }
 #endif /* CONFIG_RPS */
 
+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
+			       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(backlog_threaded_mutex);
+	int ret;
+
+	mutex_lock(&backlog_threaded_mutex);
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && !ret)
+		ret = backlog_set_threaded(backlog_threaded);
+
+	mutex_unlock(&backlog_threaded_mutex);
+
+	return ret;
+}
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 static DEFINE_MUTEX(flow_limit_update_mutex);
 
@@ -514,6 +532,15 @@  static struct ctl_table net_core_table[] = {
 		.proc_handler	= rps_default_mask_sysctl
 	},
 #endif
+	{
+		.procname	= "backlog_threaded",
+		.data		= &backlog_threaded,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= backlog_threaded_sysctl,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE
+	},
 #ifdef CONFIG_NET_FLOW_LIMIT
 	{
 		.procname	= "flow_limit_cpu_bitmap",