diff mbox series

[net] net/sched: act_pedit: really ensure the skb is writable

Message ID 6c1230ee0f348230a833f92063ff2f5fbae58b94.1651584976.git.pabeni@redhat.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net] net/sched: act_pedit: really ensure the skb is writable | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net
netdev/fixes_present success Fixes tag present in non-next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/cc_maintainers fail 2 blamed authors not CCed: davem@davemloft.net xiaosuo@gmail.com; 4 maintainers not CCed: davem@davemloft.net kuba@kernel.org xiaosuo@gmail.com edumazet@google.com
netdev/build_clang success Errors and warnings before: 9 this patch: 9
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 9 this patch: 9
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 50 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Paolo Abeni May 3, 2022, 2:05 p.m. UTC
Currently pedit tries to ensure that the accessed skb offset
is writeble via skb_unclone(). The action potentially allows
touching any skb bytes, so it may end-up modifying shared data.

The above causes some sporadic MPTCP self-test failures.

Address the issue keeping track of a rough over-estimate highest skb
offset accessed by the action and ensure such offset is really
writable.

Note that this may cause performance regressions in some scenario,
but hopefully pedit is not critical path.

Fixes: db2c24175d14 ("act_pedit: access skb->data safely")
Acked-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Tested-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note: AFAICS the issue is present since 1da177e4c3f4
("Linux-2.6.12-rc2"), but before the "Fixes" commit this change
is irrelevant, because accessing any data out of the skb head
will cause an oops.
---
 include/net/tc_act/tc_pedit.h |  1 +
 net/sched/act_pedit.c         | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

Comments

Jamal Hadi Salim May 3, 2022, 8:10 p.m. UTC | #1
What was the tc pedit command that triggered this?
Can we add it to tdc tests?

cheers,
jamal

On 2022-05-03 10:05, Paolo Abeni wrote:
> Currently pedit tries to ensure that the accessed skb offset
> is writeble via skb_unclone(). The action potentially allows
> touching any skb bytes, so it may end-up modifying shared data.
> 
> The above causes some sporadic MPTCP self-test failures.
> 
> Address the issue keeping track of a rough over-estimate highest skb
> offset accessed by the action and ensure such offset is really
> writable.
> 
> Note that this may cause performance regressions in some scenario,
> but hopefully pedit is not critical path.
> 
> Fixes: db2c24175d14 ("act_pedit: access skb->data safely")
> Acked-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
> Tested-by: Geliang Tang <geliang.tang@suse.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> Note: AFAICS the issue is present since 1da177e4c3f4
> ("Linux-2.6.12-rc2"), but before the "Fixes" commit this change
> is irrelevant, because accessing any data out of the skb head
> will cause an oops.
> ---
>   include/net/tc_act/tc_pedit.h |  1 +
>   net/sched/act_pedit.c         | 23 +++++++++++++++++++++--
>   2 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
> index 748cf87a4d7e..3e02709a1df6 100644
> --- a/include/net/tc_act/tc_pedit.h
> +++ b/include/net/tc_act/tc_pedit.h
> @@ -14,6 +14,7 @@ struct tcf_pedit {
>   	struct tc_action	common;
>   	unsigned char		tcfp_nkeys;
>   	unsigned char		tcfp_flags;
> +	u32			tcfp_off_max_hint;
>   	struct tc_pedit_key	*tcfp_keys;
>   	struct tcf_pedit_key_ex	*tcfp_keys_ex;
>   };
> diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
> index 31fcd279c177..a8ab6c3f1ea2 100644
> --- a/net/sched/act_pedit.c
> +++ b/net/sched/act_pedit.c
> @@ -149,7 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
>   	struct nlattr *pattr;
>   	struct tcf_pedit *p;
>   	int ret = 0, err;
> -	int ksize;
> +	int i, ksize;
>   	u32 index;
>   
>   	if (!nla) {
> @@ -228,6 +228,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
>   		p->tcfp_nkeys = parm->nkeys;
>   	}
>   	memcpy(p->tcfp_keys, parm->keys, ksize);
> +	p->tcfp_off_max_hint = 0;
> +	for (i = 0; i < p->tcfp_nkeys; ++i) {
> +		u32 cur = p->tcfp_keys[i].off;
> +
> +		/* The AT option can read a single byte, we can bound the actual
> +		 * value with uchar max. Each key touches 4 bytes starting from
> +		 * the computed offset
> +		 */
> +		if (p->tcfp_keys[i].offmask) {
> +			cur += 255 >> p->tcfp_keys[i].shift;
> +			cur = max(p->tcfp_keys[i].at, cur);
> +		}
> +		p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4);
> +	}
>   
>   	p->tcfp_flags = parm->flags;
>   	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
> @@ -308,9 +322,14 @@ static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
>   			 struct tcf_result *res)
>   {
>   	struct tcf_pedit *p = to_pedit(a);
> +	u32 max_offset;
>   	int i;
>   
> -	if (skb_unclone(skb, GFP_ATOMIC))
> +	max_offset = (skb_transport_header_was_set(skb) ?
> +		      skb_transport_offset(skb) :
> +		      skb_network_offset(skb)) +
> +		     p->tcfp_off_max_hint;
> +	if (skb_ensure_writable(skb, min(skb->len, max_offset)))
>   		return p->tcf_action;
>   
>   	spin_lock(&p->tcf_lock);
Paolo Abeni May 4, 2022, 8:52 a.m. UTC | #2
On Tue, 2022-05-03 at 16:10 -0400, Jamal Hadi Salim wrote:
> What was the tc pedit command that triggered this?

From the mptcp self-tests, mptcp_join.sh:

tc -n $ns2 filter add dev ns2eth$i egress \
		protocol ip prio 1000 \
		handle 42 fw \
		action pedit munge offset 148 u8 invert \
		pipe csum tcp \
		index 100 || exit 1

It's used to corrupt a packet so that TCP csum is still correct while
the MPTCP one is not.

The relevant part is that the touched offset is outside the skb head.

> Can we add it to tdc tests?

What happens in the mptcp self-tests it that an almost simultaneous
mptcp-level reinjection on another device using the same cloned data
get unintentionally corrupted and we catch it - when it sporadically
happens - via the MPTCP mibs.

While we could add the above pedit command, but I fear that a
meaningful test for the issue addressed here not fit the tdc
infrastructure easily.

Thanks,

Paolo
Jakub Kicinski May 4, 2022, 2:47 p.m. UTC | #3
On Wed, 04 May 2022 10:52:59 +0200 Paolo Abeni wrote:
> On Tue, 2022-05-03 at 16:10 -0400, Jamal Hadi Salim wrote:
> > What was the tc pedit command that triggered this?  
> 
> From the mptcp self-tests, mptcp_join.sh:
> 
> tc -n $ns2 filter add dev ns2eth$i egress \
> 		protocol ip prio 1000 \
> 		handle 42 fw \
> 		action pedit munge offset 148 u8 invert \
> 		pipe csum tcp \
> 		index 100 || exit 1
> 
> It's used to corrupt a packet so that TCP csum is still correct while
> the MPTCP one is not.
> 
> The relevant part is that the touched offset is outside the skb head.
> 
> > Can we add it to tdc tests?  
> 
> What happens in the mptcp self-tests it that an almost simultaneous
> mptcp-level reinjection on another device using the same cloned data
> get unintentionally corrupted and we catch it - when it sporadically
> happens - via the MPTCP mibs.
> 
> While we could add the above pedit command, but I fear that a
> meaningful test for the issue addressed here not fit the tdc
> infrastructure easily.

For testing stuff like this would it be possible to inject packets
with no headers pulled and frags in pages we marked read-only?
We can teach netdevsim to do it.

Obviously not as a pre-requisite for this patch.
Paolo Abeni May 4, 2022, 3:11 p.m. UTC | #4
On Wed, 2022-05-04 at 07:47 -0700, Jakub Kicinski wrote:
> On Wed, 04 May 2022 10:52:59 +0200 Paolo Abeni wrote:
> > On Tue, 2022-05-03 at 16:10 -0400, Jamal Hadi Salim wrote:
> > > What was the tc pedit command that triggered this?  
> > 
> > From the mptcp self-tests, mptcp_join.sh:
> > 
> > tc -n $ns2 filter add dev ns2eth$i egress \
> > 		protocol ip prio 1000 \
> > 		handle 42 fw \
> > 		action pedit munge offset 148 u8 invert \
> > 		pipe csum tcp \
> > 		index 100 || exit 1
> > 
> > It's used to corrupt a packet so that TCP csum is still correct while
> > the MPTCP one is not.
> > 
> > The relevant part is that the touched offset is outside the skb head.
> > 
> > > Can we add it to tdc tests?  
> > 
> > What happens in the mptcp self-tests it that an almost simultaneous
> > mptcp-level reinjection on another device using the same cloned data
> > get unintentionally corrupted and we catch it - when it sporadically
> > happens - via the MPTCP mibs.
> > 
> > While we could add the above pedit command, but I fear that a
> > meaningful test for the issue addressed here not fit the tdc
> > infrastructure easily.
> 
> For testing stuff like this would it be possible to inject packets
> with no headers pulled and frags in pages we marked read-only?
> We can teach netdevsim to do it.

We additionally need to ensure that the crafted packets are cloned,
otherwise the current code is AFAICS fine. And at the point we likely
want to configure the packet layout (hdrs/address) created by
netdevsim. 

> Obviously not as a pre-requisite for this patch.

I agree it looks a bit out-of-scope here ;)

Paolo
Jakub Kicinski May 4, 2022, 3:33 p.m. UTC | #5
On Wed, 04 May 2022 17:11:25 +0200 Paolo Abeni wrote:
> > For testing stuff like this would it be possible to inject packets
> > with no headers pulled and frags in pages we marked read-only?
> > We can teach netdevsim to do it.  
> 
> We additionally need to ensure that the crafted packets are cloned,
> otherwise the current code is AFAICS fine.

Depends whether skb frags are writable, or not. Or to put it differently
why does skb_cow_data() exist and does what it does.
Jakub Kicinski May 5, 2022, 3:04 a.m. UTC | #6
On Tue,  3 May 2022 16:05:42 +0200 Paolo Abeni wrote:
> Currently pedit tries to ensure that the accessed skb offset
> is writeble via skb_unclone(). The action potentially allows
> touching any skb bytes, so it may end-up modifying shared data.
> 
> The above causes some sporadic MPTCP self-test failures.
> 
> Address the issue keeping track of a rough over-estimate highest skb
> offset accessed by the action and ensure such offset is really
> writable.
> 
> Note that this may cause performance regressions in some scenario,
> but hopefully pedit is not critical path.
> 
> Fixes: db2c24175d14 ("act_pedit: access skb->data safely")
> Acked-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
> Tested-by: Geliang Tang <geliang.tang@suse.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> Note: AFAICS the issue is present since 1da177e4c3f4
> ("Linux-2.6.12-rc2"), but before the "Fixes" commit this change
> is irrelevant, because accessing any data out of the skb head
> will cause an oops.
> ---
>  include/net/tc_act/tc_pedit.h |  1 +
>  net/sched/act_pedit.c         | 23 +++++++++++++++++++++--
>  2 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
> index 748cf87a4d7e..3e02709a1df6 100644
> --- a/include/net/tc_act/tc_pedit.h
> +++ b/include/net/tc_act/tc_pedit.h
> @@ -14,6 +14,7 @@ struct tcf_pedit {
>  	struct tc_action	common;
>  	unsigned char		tcfp_nkeys;
>  	unsigned char		tcfp_flags;
> +	u32			tcfp_off_max_hint;
>  	struct tc_pedit_key	*tcfp_keys;
>  	struct tcf_pedit_key_ex	*tcfp_keys_ex;
>  };
> diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
> index 31fcd279c177..a8ab6c3f1ea2 100644
> --- a/net/sched/act_pedit.c
> +++ b/net/sched/act_pedit.c
> @@ -149,7 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
>  	struct nlattr *pattr;
>  	struct tcf_pedit *p;
>  	int ret = 0, err;
> -	int ksize;
> +	int i, ksize;
>  	u32 index;
>  
>  	if (!nla) {
> @@ -228,6 +228,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
>  		p->tcfp_nkeys = parm->nkeys;
>  	}
>  	memcpy(p->tcfp_keys, parm->keys, ksize);
> +	p->tcfp_off_max_hint = 0;

This gets zeroed here... [1]

> +	for (i = 0; i < p->tcfp_nkeys; ++i) {
> +		u32 cur = p->tcfp_keys[i].off;
> +
> +		/* The AT option can read a single byte, we can bound the actual
> +		 * value with uchar max. Each key touches 4 bytes starting from
> +		 * the computed offset
> +		 */
> +		if (p->tcfp_keys[i].offmask) {
> +			cur += 255 >> p->tcfp_keys[i].shift;

Could be written as:

		cur += (0xff & p->tcfp_keys[i].offmask) >>
			p->tcfp_keys[i].shift;

without the if? That would be closer to the:

		offset += (*d & tkey->offmask) >> tkey->shift;

which ends up getting executed.

> +			cur = max(p->tcfp_keys[i].at, cur);

We never write under ->at, tho, so this shouldn't be needed?

> +		}
> +		p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4);
> +	}
>  
>  	p->tcfp_flags = parm->flags;
>  	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
> @@ -308,9 +322,14 @@ static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
>  			 struct tcf_result *res)
>  {
>  	struct tcf_pedit *p = to_pedit(a);
> +	u32 max_offset;
>  	int i;
>  
> -	if (skb_unclone(skb, GFP_ATOMIC))
> +	max_offset = (skb_transport_header_was_set(skb) ?
> +		      skb_transport_offset(skb) :
> +		      skb_network_offset(skb)) +
> +		     p->tcfp_off_max_hint;

[1] ... and used here outside of the lock. Isn't it racy?

> +	if (skb_ensure_writable(skb, min(skb->len, max_offset)))
>  		return p->tcf_action;
>  
>  	spin_lock(&p->tcf_lock);
Paolo Abeni May 5, 2022, 2:13 p.m. UTC | #7
On Wed, 2022-05-04 at 20:04 -0700, Jakub Kicinski wrote:
> On Tue,  3 May 2022 16:05:42 +0200 Paolo Abeni wrote:
> > Currently pedit tries to ensure that the accessed skb offset
> > is writeble via skb_unclone(). The action potentially allows
> > touching any skb bytes, so it may end-up modifying shared data.
> > 
> > The above causes some sporadic MPTCP self-test failures.
> > 
> > Address the issue keeping track of a rough over-estimate highest skb
> > offset accessed by the action and ensure such offset is really
> > writable.
> > 
> > Note that this may cause performance regressions in some scenario,
> > but hopefully pedit is not critical path.
> > 
> > Fixes: db2c24175d14 ("act_pedit: access skb->data safely")
> > Acked-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
> > Tested-by: Geliang Tang <geliang.tang@suse.com>
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > Note: AFAICS the issue is present since 1da177e4c3f4
> > ("Linux-2.6.12-rc2"), but before the "Fixes" commit this change
> > is irrelevant, because accessing any data out of the skb head
> > will cause an oops.
> > ---
> >  include/net/tc_act/tc_pedit.h |  1 +
> >  net/sched/act_pedit.c         | 23 +++++++++++++++++++++--
> >  2 files changed, 22 insertions(+), 2 deletions(-)
> > 
> > diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
> > index 748cf87a4d7e..3e02709a1df6 100644
> > --- a/include/net/tc_act/tc_pedit.h
> > +++ b/include/net/tc_act/tc_pedit.h
> > @@ -14,6 +14,7 @@ struct tcf_pedit {
> >  	struct tc_action	common;
> >  	unsigned char		tcfp_nkeys;
> >  	unsigned char		tcfp_flags;
> > +	u32			tcfp_off_max_hint;
> >  	struct tc_pedit_key	*tcfp_keys;
> >  	struct tcf_pedit_key_ex	*tcfp_keys_ex;
> >  };
> > diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
> > index 31fcd279c177..a8ab6c3f1ea2 100644
> > --- a/net/sched/act_pedit.c
> > +++ b/net/sched/act_pedit.c
> > @@ -149,7 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
> >  	struct nlattr *pattr;
> >  	struct tcf_pedit *p;
> >  	int ret = 0, err;
> > -	int ksize;
> > +	int i, ksize;
> >  	u32 index;
> >  
> >  	if (!nla) {
> > @@ -228,6 +228,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
> >  		p->tcfp_nkeys = parm->nkeys;
> >  	}
> >  	memcpy(p->tcfp_keys, parm->keys, ksize);
> > +	p->tcfp_off_max_hint = 0;
> 
> This gets zeroed here... [1]
> 
> > +	for (i = 0; i < p->tcfp_nkeys; ++i) {
> > +		u32 cur = p->tcfp_keys[i].off;
> > +
> > +		/* The AT option can read a single byte, we can bound the actual
> > +		 * value with uchar max. Each key touches 4 bytes starting from
> > +		 * the computed offset
> > +		 */
> > +		if (p->tcfp_keys[i].offmask) {
> > +			cur += 255 >> p->tcfp_keys[i].shift;
> 
> Could be written as:
> 
> 		cur += (0xff & p->tcfp_keys[i].offmask) >>
> 			p->tcfp_keys[i].shift;
> 
> without the if? That would be closer to the:
> 
> 		offset += (*d & tkey->offmask) >> tkey->shift;
> 
> which ends up getting executed.
> 
> > +			cur = max(p->tcfp_keys[i].at, cur);
> 
> We never write under ->at, tho, so this shouldn't be needed?

Every thing you mentioned looks correct, I'll take care in v2.

> 
> > +		}
> > +		p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4);
> > +	}
> >  
> >  	p->tcfp_flags = parm->flags;
> >  	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
> > @@ -308,9 +322,14 @@ static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
> >  			 struct tcf_result *res)
> >  {
> >  	struct tcf_pedit *p = to_pedit(a);
> > +	u32 max_offset;
> >  	int i;
> >  
> > -	if (skb_unclone(skb, GFP_ATOMIC))
> > +	max_offset = (skb_transport_header_was_set(skb) ?
> > +		      skb_transport_offset(skb) :
> > +		      skb_network_offset(skb)) +
> > +		     p->tcfp_off_max_hint;
> 
> [1] ... and used here outside of the lock. Isn't it racy?

Indeed it is. I'll fix in v2 extending the lock over here (including
the allocation, sigh).

Later, for net-next I think we could refactor the code to avoid the
lock in the data path with something alike c749cdda9089 ("net/sched:
act_skbedit: don't use spinlock in the data path")


Thanks!

Paolo
diff mbox series

Patch

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 748cf87a4d7e..3e02709a1df6 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -14,6 +14,7 @@  struct tcf_pedit {
 	struct tc_action	common;
 	unsigned char		tcfp_nkeys;
 	unsigned char		tcfp_flags;
+	u32			tcfp_off_max_hint;
 	struct tc_pedit_key	*tcfp_keys;
 	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 };
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 31fcd279c177..a8ab6c3f1ea2 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -149,7 +149,7 @@  static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	struct nlattr *pattr;
 	struct tcf_pedit *p;
 	int ret = 0, err;
-	int ksize;
+	int i, ksize;
 	u32 index;
 
 	if (!nla) {
@@ -228,6 +228,20 @@  static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		p->tcfp_nkeys = parm->nkeys;
 	}
 	memcpy(p->tcfp_keys, parm->keys, ksize);
+	p->tcfp_off_max_hint = 0;
+	for (i = 0; i < p->tcfp_nkeys; ++i) {
+		u32 cur = p->tcfp_keys[i].off;
+
+		/* The AT option can read a single byte, we can bound the actual
+		 * value with uchar max. Each key touches 4 bytes starting from
+		 * the computed offset
+		 */
+		if (p->tcfp_keys[i].offmask) {
+			cur += 255 >> p->tcfp_keys[i].shift;
+			cur = max(p->tcfp_keys[i].at, cur);
+		}
+		p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4);
+	}
 
 	p->tcfp_flags = parm->flags;
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
@@ -308,9 +322,14 @@  static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a,
 			 struct tcf_result *res)
 {
 	struct tcf_pedit *p = to_pedit(a);
+	u32 max_offset;
 	int i;
 
-	if (skb_unclone(skb, GFP_ATOMIC))
+	max_offset = (skb_transport_header_was_set(skb) ?
+		      skb_transport_offset(skb) :
+		      skb_network_offset(skb)) +
+		     p->tcfp_off_max_hint;
+	if (skb_ensure_writable(skb, min(skb->len, max_offset)))
 		return p->tcf_action;
 
 	spin_lock(&p->tcf_lock);