[v2,net-next,6/8] net: rps: change input_queue_tail_incr_save()

Message ID 20240329154225.349288-7-edumazet@google.com (mailing list archive)
State Accepted
Delegated to: Netdev Maintainers
Series net: rps: misc changes

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4889 this patch: 4889
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 4 of 4 maintainers
netdev/build_clang success Errors and warnings before: 1057 this patch: 1057
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5167 this patch: 5167
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 120 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-03-31--03-00 (tests: 953)

Commit Message

Eric Dumazet March 29, 2024, 3:42 p.m. UTC
input_queue_tail_incr_save() increments the sd queue_tail
and saves it in the flow's last_qtail.

Two issues here:

- No lock protects the write to last_qtail; we should use appropriate
  annotations.

- We can perform this write after releasing the per-cpu backlog lock,
  reducing the lock hold duration (moving the cache line miss out of
  the critical section); see the sketch below.
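
The enqueue path thus becomes, in sketch form (simplified from the
enqueue_to_backlog() hunk below):

	backlog_lock_irq_save(sd, &flags);
	__skb_queue_tail(&sd->input_pkt_queue, skb);
	/* ++sd->input_queue_tail, still under the backlog lock */
	tail = rps_input_queue_tail_incr(sd);
	backlog_unlock_irq_restore(sd, &flags);

	/* WRITE_ONCE(*qtail, tail), outside of the critical section */
	rps_input_queue_tail_save(qtail, tail);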

Also move input_queue_head_incr() and the rps helpers to
include/net/rps.h, adding an rps_ prefix to better reflect their role.

v2: fixed a build issue (reported by Jakub and the kernel build bots)

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/netdevice.h | 15 ---------------
 include/net/rps.h         | 23 +++++++++++++++++++++++
 net/core/dev.c            | 20 ++++++++++++--------
 3 files changed, 35 insertions(+), 23 deletions(-)

Comments

Jason Xing March 30, 2024, 2:46 p.m. UTC | #1
Hello Eric,

On Fri, Mar 29, 2024 at 11:43 PM Eric Dumazet <edumazet@google.com> wrote:
>
> [...]
>
> +static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
> +{
> +#ifdef CONFIG_RPS
> +       WRITE_ONCE(*dest, tail);
> +#endif
> +}

I wonder if we should also use this new helper to WRITE_ONCE
last_qtail in set_rps_cpu()?
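
That is, the plain store in set_rps_cpu() (current code, quoted for
reference):

	rflow->last_qtail =
		per_cpu(softnet_data, next_cpu).input_queue_head;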

Thanks,
Jason

Eric Dumazet March 30, 2024, 4:01 p.m. UTC | #2
On Sat, Mar 30, 2024 at 3:47 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> Hello Eric,
>
> On Fri, Mar 29, 2024 at 11:43 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > [...]
> >
> > I wonder if we should also use this new helper to WRITE_ONCE
> > last_qtail in set_rps_cpu()?
>

Absolutely, I have another patch series to address remaining races
(rflow->cpu, rflow->filter ...)

I chose to make a small one, to ease reviews.
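
Something along these lines (untested sketch of the set_rps_cpu()
part):

	- rflow->filter = filter;
	+ WRITE_ONCE(rflow->filter, filter);

	- rflow->cpu = next_cpu;
	+ WRITE_ONCE(rflow->cpu, next_cpu);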
Jason Xing March 30, 2024, 4:07 p.m. UTC | #3
On Sun, Mar 31, 2024 at 12:01 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sat, Mar 30, 2024 at 3:47 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > Hello Eric,
> >
> > On Fri, Mar 29, 2024 at 11:43 PM Eric Dumazet <edumazet@google.com> wrote:
> > > [...]
> >
> > I wonder if we should also use this new helper to WRITE_ONCE
> > last_qtail in set_rps_cpu()?
> >
>
> Absolutely, I have another patch series to address remaining races
> (rflow->cpu, rflow->filter ...)
>
> I chose to make a small one, to ease reviews.

Great. Now I have no more questions :)

Thanks,
Jason
Jason Xing April 12, 2024, 3:09 p.m. UTC | #4
On Sun, Mar 31, 2024 at 12:01 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Sat, Mar 30, 2024 at 3:47 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > Hello Eric,
> >
> > On Fri, Mar 29, 2024 at 11:43 PM Eric Dumazet <edumazet@google.com> wrote:
> > > [...]
> >
> > I wonder if we should also use this new helper to WRITE_ONCE
> > last_qtail in set_rps_cpu()?
> >
>
> Absolutely, I have another patch series to address remaining races
> (rflow->cpu, rflow->filter ...)
>
> I chose to make a small one, to ease reviews.

Hello Eric,

I wonder if you already have a patch series changing those three
members of struct rps_dev_flow? I looked through this part and it does
not seem that complicated, so if not, I can do it :)

Thanks,
Jason

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c31cd2691d32064613836141fbdeeebc831b21f..14f19cc2616452d7e6afbbaa52f8ad3e61a419e9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3249,21 +3249,6 @@  struct softnet_data {
 	call_single_data_t	defer_csd;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
-{
-#ifdef CONFIG_RPS
-	sd->input_queue_head++;
-#endif
-}
-
-static inline void input_queue_tail_incr_save(struct softnet_data *sd,
-					      unsigned int *qtail)
-{
-#ifdef CONFIG_RPS
-	*qtail = ++sd->input_queue_tail;
-#endif
-}
-
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 static inline int dev_recursion_level(void)
diff --git a/include/net/rps.h b/include/net/rps.h
index 7660243e905b92651a41292e04caf72c5f12f26e..10ca25731c1ef766715fe7ee415ad0b71ec643a8 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -122,4 +122,27 @@  static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
+static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	return ++sd->input_queue_tail;
+#else
+	return 0;
+#endif
+}
+
+static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
+{
+#ifdef CONFIG_RPS
+	WRITE_ONCE(*dest, tail);
+#endif
+}
+
+static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	sd->input_queue_head++;
+#endif
+}
+
 #endif /* _NET_RPS_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 0a8ccb0451c30a39f8f8b45d26b7e5548b8bfba4..79073bbc9a644049cacf8433310f4641745049e9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4611,7 +4611,7 @@  static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-		      rflow->last_qtail)) >= 0)) {
+		      READ_ONCE(rflow->last_qtail))) >= 0)) {
 			tcpu = next_cpu;
 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 		}
@@ -4666,7 +4666,7 @@  bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 		cpu = READ_ONCE(rflow->cpu);
 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
-			   rflow->last_qtail) <
+			   READ_ONCE(rflow->last_qtail)) <
 		     (int)(10 * flow_table->mask)))
 			expire = false;
 	}
@@ -4801,6 +4801,7 @@  static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	unsigned long flags;
 	unsigned int qlen;
 	int max_backlog;
+	u32 tail;
 
 	reason = SKB_DROP_REASON_DEV_READY;
 	if (!netif_running(skb->dev))
@@ -4825,8 +4826,11 @@  static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 				napi_schedule_rps(sd);
 		}
 		__skb_queue_tail(&sd->input_pkt_queue, skb);
-		input_queue_tail_incr_save(sd, qtail);
+		tail = rps_input_queue_tail_incr(sd);
 		backlog_unlock_irq_restore(sd, &flags);
+
+		/* save the tail outside of the critical section */
+		rps_input_queue_tail_save(qtail, tail);
 		return NET_RX_SUCCESS;
 	}
 
@@ -5904,7 +5908,7 @@  static void flush_backlog(struct work_struct *work)
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			dev_kfree_skb_irq(skb);
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 		}
 	}
 	backlog_unlock_irq_enable(sd);
@@ -5913,7 +5917,7 @@  static void flush_backlog(struct work_struct *work)
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
 			kfree_skb(skb);
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 		}
 	}
 	local_bh_enable();
@@ -6041,7 +6045,7 @@  static int process_backlog(struct napi_struct *napi, int quota)
 			rcu_read_lock();
 			__netif_receive_skb(skb);
 			rcu_read_unlock();
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 			if (++work >= quota)
 				return work;
 
@@ -11455,11 +11459,11 @@  static int dev_cpu_dead(unsigned int oldcpu)
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		rps_input_queue_head_incr(oldsd);
 	}
 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		rps_input_queue_head_incr(oldsd);
 	}
 
 	return 0;