diff mbox

Lockup/High ksoftirqd when rate-limiting is enabled

Message ID 20170620111849.aiouc66mps3jbjvo@citrix.com (mailing list archive)
State New, archived
Headers show

Commit Message

Wei Liu June 20, 2017, 11:18 a.m. UTC
On Tue, Jun 20, 2017 at 11:31:02AM +0200, Jean-Louis Dupond wrote:
> Hi,
> 
> As requested via IRC i'm sending this to xen-devel & netback maintainers.
> 
> We are using Xen 4.4.4-23.el6 with kernel 3.18.44-20.el6.x86_64.
> Now recently we're having issues with rate-limiting enabled.
> 
> When we enable rate limiting in Xen, and then do alot of outbound traffic on
> the domU, we notice a high ksoftirqd load.
> But in some cases the system locks up completely.
> 

Can you give this patch a try?

---8<--
From a242d4a74cc4ec46c5e3d43dd07eb146be4ca233 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Tue, 20 Jun 2017 11:49:28 +0100
Subject: [PATCH] xen-netback: correctly schedule rate-limited queues

Add a flag to indicate if a queue is rate-limited. Test the flag in
NAPI poll handler and avoid rescheduling the queue if true, otherwise
we risk locking up the host. The rescheduling shall be done when
replenishing credit.

Reported-by: Jean-Louis Dupond <jean-louis@dupond.be>
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/net/xen-netback/common.h    | 1 +
 drivers/net/xen-netback/interface.c | 6 +++++-
 drivers/net/xen-netback/netback.c   | 6 +++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

Comments

Jean-Louis Dupond June 21, 2017, 8:35 a.m. UTC | #1
Thanks for this quick patch.
I was able to test it today, and the high ksoftirqd cpu usage is gone.

Great!

Is there a chance this can get pushed into stable kernel versions 
(3.18.x, 4.4.x, etc)?
There is not really a backport work, as the netback driver hasn't 
changed alot recently.


Tested-by: Jean-Louis Dupond <jean-louis@dupond.be>


Op 2017-06-20 13:18, schreef Wei Liu:
> On Tue, Jun 20, 2017 at 11:31:02AM +0200, Jean-Louis Dupond wrote:
>> Hi,
>> 
>> As requested via IRC i'm sending this to xen-devel & netback 
>> maintainers.
>> 
>> We are using Xen 4.4.4-23.el6 with kernel 3.18.44-20.el6.x86_64.
>> Now recently we're having issues with rate-limiting enabled.
>> 
>> When we enable rate limiting in Xen, and then do alot of outbound 
>> traffic on
>> the domU, we notice a high ksoftirqd load.
>> But in some cases the system locks up completely.
>> 
> 
> Can you give this patch a try?
> 
> ---8<--
> From a242d4a74cc4ec46c5e3d43dd07eb146be4ca233 Mon Sep 17 00:00:00 2001
> From: Wei Liu <wei.liu2@citrix.com>
> Date: Tue, 20 Jun 2017 11:49:28 +0100
> Subject: [PATCH] xen-netback: correctly schedule rate-limited queues
> 
> Add a flag to indicate if a queue is rate-limited. Test the flag in
> NAPI poll handler and avoid rescheduling the queue if true, otherwise
> we risk locking up the host. The rescheduling shall be done when
> replenishing credit.
> 
> Reported-by: Jean-Louis Dupond <jean-louis@dupond.be>
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/net/xen-netback/common.h    | 1 +
>  drivers/net/xen-netback/interface.c | 6 +++++-
>  drivers/net/xen-netback/netback.c   | 6 +++++-
>  3 files changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/common.h 
> b/drivers/net/xen-netback/common.h
> index 530586be05b4..5b1d2e8402d9 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -199,6 +199,7 @@ struct xenvif_queue { /* Per-queue data for xenvif 
> */
>  	unsigned long   remaining_credit;
>  	struct timer_list credit_timeout;
>  	u64 credit_window_start;
> +	bool rate_limited;
> 
>  	/* Statistics */
>  	struct xenvif_stats stats;
> diff --git a/drivers/net/xen-netback/interface.c
> b/drivers/net/xen-netback/interface.c
> index 8397f6c92451..e322a862ddfe 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -106,7 +106,11 @@ static int xenvif_poll(struct napi_struct *napi,
> int budget)
> 
>  	if (work_done < budget) {
>  		napi_complete_done(napi, work_done);
> -		xenvif_napi_schedule_or_enable_events(queue);
> +		/* If the queue is rate-limited, it shall be
> +		 * rescheduled in the timer callback.
> +		 */
> +		if (likely(!queue->rate_limited))
> +			xenvif_napi_schedule_or_enable_events(queue);
>  	}
> 
>  	return work_done;
> diff --git a/drivers/net/xen-netback/netback.c
> b/drivers/net/xen-netback/netback.c
> index 602d408fa25e..5042ff8d449a 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -180,6 +180,7 @@ static void tx_add_credit(struct xenvif_queue 
> *queue)
>  		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
> 
>  	queue->remaining_credit = min(max_credit, max_burst);
> +	queue->rate_limited = false;
>  }
> 
>  void xenvif_tx_credit_callback(unsigned long data)
> @@ -686,8 +687,10 @@ static bool tx_credit_exceeded(struct
> xenvif_queue *queue, unsigned size)
>  		msecs_to_jiffies(queue->credit_usec / 1000);
> 
>  	/* Timer could already be pending in rare cases. */
> -	if (timer_pending(&queue->credit_timeout))
> +	if (timer_pending(&queue->credit_timeout)) {
> +		queue->rate_limited = true;
>  		return true;
> +	}
> 
>  	/* Passed the point where we can replenish credit? */
>  	if (time_after_eq64(now, next_credit)) {
> @@ -702,6 +705,7 @@ static bool tx_credit_exceeded(struct xenvif_queue
> *queue, unsigned size)
>  		mod_timer(&queue->credit_timeout,
>  			  next_credit);
>  		queue->credit_window_start = next_credit;
> +		queue->rate_limited = true;
> 
>  		return true;
>  	}
Wei Liu June 21, 2017, 9:10 a.m. UTC | #2
On Wed, Jun 21, 2017 at 10:35:11AM +0200, Jean-Louis Dupond wrote:
> Thanks for this quick patch.
> I was able to test it today, and the high ksoftirqd cpu usage is gone.
> 
> Great!
> 
> Is there a chance this can get pushed into stable kernel versions (3.18.x,
> 4.4.x, etc)?
> There is not really a backport work, as the netback driver hasn't changed
> alot recently.

3.18 is EOL. I think it will eventually trickle down to all 4.X longterm
kernels.

> 
> 
> Tested-by: Jean-Louis Dupond <jean-louis@dupond.be>
> 

Thanks. I will submit this to netdev soon.
diff mbox

Patch

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 530586be05b4..5b1d2e8402d9 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -199,6 +199,7 @@  struct xenvif_queue { /* Per-queue data for xenvif */
 	unsigned long   remaining_credit;
 	struct timer_list credit_timeout;
 	u64 credit_window_start;
+	bool rate_limited;
 
 	/* Statistics */
 	struct xenvif_stats stats;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 8397f6c92451..e322a862ddfe 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -106,7 +106,11 @@  static int xenvif_poll(struct napi_struct *napi, int budget)
 
 	if (work_done < budget) {
 		napi_complete_done(napi, work_done);
-		xenvif_napi_schedule_or_enable_events(queue);
+		/* If the queue is rate-limited, it shall be
+		 * rescheduled in the timer callback.
+		 */
+		if (likely(!queue->rate_limited))
+			xenvif_napi_schedule_or_enable_events(queue);
 	}
 
 	return work_done;
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 602d408fa25e..5042ff8d449a 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -180,6 +180,7 @@  static void tx_add_credit(struct xenvif_queue *queue)
 		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
 
 	queue->remaining_credit = min(max_credit, max_burst);
+	queue->rate_limited = false;
 }
 
 void xenvif_tx_credit_callback(unsigned long data)
@@ -686,8 +687,10 @@  static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
 		msecs_to_jiffies(queue->credit_usec / 1000);
 
 	/* Timer could already be pending in rare cases. */
-	if (timer_pending(&queue->credit_timeout))
+	if (timer_pending(&queue->credit_timeout)) {
+		queue->rate_limited = true;
 		return true;
+	}
 
 	/* Passed the point where we can replenish credit? */
 	if (time_after_eq64(now, next_credit)) {
@@ -702,6 +705,7 @@  static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size)
 		mod_timer(&queue->credit_timeout,
 			  next_credit);
 		queue->credit_window_start = next_credit;
+		queue->rate_limited = true;
 
 		return true;
 	}