diff mbox series

[ipsec-next,2/3] xfrm: interface: Add unstable helpers for setting/getting XFRM metadata from TC-BPF

Message ID 20221128160501.769892-3-eyal.birger@gmail.com (mailing list archive)
State New
Headers show
Series xfrm: interface: Add unstable helpers for XFRM metadata | expand

Commit Message

Eyal Birger Nov. 28, 2022, 4:05 p.m. UTC
This change adds xfrm metadata helpers using the unstable kfunc call
interface for the TC-BPF hooks. This allows steering traffic towards
different IPsec connections based on logic implemented in bpf programs.

This object is built based on the availabilty of BTF debug info.

The metadata percpu dsts used on TX take ownership of the original skb
dsts so that they may be used as part of the xfrm transmittion logic -
e.g.  for MTU calculations.

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
---
 include/net/dst_metadata.h     |  1 +
 include/net/xfrm.h             | 20 ++++++++
 net/core/dst.c                 |  4 ++
 net/xfrm/Makefile              |  6 +++
 net/xfrm/xfrm_interface_bpf.c  | 92 ++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_interface_core.c | 15 ++++++
 6 files changed, 138 insertions(+)
 create mode 100644 net/xfrm/xfrm_interface_bpf.c

Comments

Martin KaFai Lau Nov. 29, 2022, 1:58 a.m. UTC | #1
On 11/28/22 8:05 AM, Eyal Birger wrote:
> This change adds xfrm metadata helpers using the unstable kfunc call
> interface for the TC-BPF hooks. This allows steering traffic towards
> different IPsec connections based on logic implemented in bpf programs.
> 
> This object is built based on the availabilty of BTF debug info.
> 
> The metadata percpu dsts used on TX take ownership of the original skb
> dsts so that they may be used as part of the xfrm transmittion logic -
> e.g.  for MTU calculations.

A few quick comments and questions:

> 
> Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
> ---
>   include/net/dst_metadata.h     |  1 +
>   include/net/xfrm.h             | 20 ++++++++
>   net/core/dst.c                 |  4 ++
>   net/xfrm/Makefile              |  6 +++
>   net/xfrm/xfrm_interface_bpf.c  | 92 ++++++++++++++++++++++++++++++++++

Please tag for bpf-next

>   net/xfrm/xfrm_interface_core.c | 15 ++++++
>   6 files changed, 138 insertions(+)
>   create mode 100644 net/xfrm/xfrm_interface_bpf.c
> 
> diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
> index a454cf4327fe..1b7fae4c6b24 100644
> --- a/include/net/dst_metadata.h
> +++ b/include/net/dst_metadata.h
> @@ -26,6 +26,7 @@ struct macsec_info {
>   struct xfrm_md_info {
>   	u32 if_id;
>   	int link;
> +	struct dst_entry *dst_orig;
>   };
>   
>   struct metadata_dst {

[ ... ]

> diff --git a/net/core/dst.c b/net/core/dst.c
> index bc9c9be4e080..4c2eb7e56dab 100644
> --- a/net/core/dst.c
> +++ b/net/core/dst.c
> @@ -315,6 +315,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
>   #ifdef CONFIG_DST_CACHE
>   	if (md_dst->type == METADATA_IP_TUNNEL)
>   		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
> +	else if (md_dst->type == METADATA_XFRM)
> +		dst_release(md_dst->u.xfrm_info.dst_orig);

Why only release dst_orig under CONFIG_DST_CACHE?

>   #endif
>   	kfree(md_dst);
>   }
> @@ -348,6 +350,8 @@ void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
>   
>   		if (one_md_dst->type == METADATA_IP_TUNNEL)
>   			dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
> +		else if (one_md_dst->type == METADATA_XFRM)
> +			dst_release(one_md_dst->u.xfrm_info.dst_orig);

Same here.

[ ... ]

> diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
> new file mode 100644
> index 000000000000..d3997ab7cc28
> --- /dev/null
> +++ b/net/xfrm/xfrm_interface_bpf.c
> @@ -0,0 +1,92 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Unstable XFRM Helpers for TC-BPF hook
> + *
> + * These are called from SCHED_CLS BPF programs. Note that it is
> + * allowed to break compatibility for these functions since the interface they
> + * are exposed through to BPF programs is explicitly unstable.
> + */
> +
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +
> +#include <net/dst_metadata.h>
> +#include <net/xfrm.h>
> +
> +struct bpf_xfrm_info {
> +	u32 if_id;
> +	int link;
> +};
> +
> +static struct metadata_dst __percpu *xfrm_md_dst;
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "Global functions as their definitions will be in xfrm_interface BTF");
> +
> +__used noinline
> +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
> +{
> +	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> +	struct xfrm_md_info *info;
> +
> +	memset(to, 0, sizeof(*to));
> +
> +	info = skb_xfrm_md_info(skb);
> +	if (!info)
> +		return -EINVAL;
> +
> +	to->if_id = info->if_id;
> +	to->link = info->link;
> +	return 0;
> +}
> +
> +__used noinline
> +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> +			  const struct bpf_xfrm_info *from)
> +{
> +	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> +	struct metadata_dst *md_dst;
> +	struct xfrm_md_info *info;
> +
> +	if (unlikely(skb_metadata_dst(skb)))
> +		return -EINVAL;
> +
> +	md_dst = this_cpu_ptr(xfrm_md_dst);
> +
> +	info = &md_dst->u.xfrm_info;
> +	memset(info, 0, sizeof(*info));
> +
> +	info->if_id = from->if_id;
> +	info->link = from->link;
> +	info->dst_orig = skb_dst(skb);
However, the dst_orig init is not done under CONFIG_DST_CACHE though...

Also, is it possible that skb->_skb_refdst has SKB_DST_NOREF set and later below 
... (contd)

> +
> +	dst_hold((struct dst_entry *)md_dst);
> +	skb_dst_set(skb, (struct dst_entry *)md_dst);
> +	return 0;
> +}
> +
> +__diag_pop()
> +
> +BTF_SET8_START(xfrm_ifc_kfunc_set)
> +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
> +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
> +BTF_SET8_END(xfrm_ifc_kfunc_set)
> +
> +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &xfrm_ifc_kfunc_set,
> +};
> +
> +int __init register_xfrm_interface_bpf(void)
> +{
> +	xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
> +						GFP_KERNEL);
> +	if (!xfrm_md_dst)
> +		return -ENOMEM;
> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
> +					 &xfrm_interface_kfunc_set);

Will cleanup_xfrm_interface_bpf() be called during error ?

> +}
> +
> +void __exit cleanup_xfrm_interface_bpf(void)
> +{
> +	metadata_dst_free_percpu(xfrm_md_dst);
> +}
> diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
> index 5a67b120c4db..1e1e8e965939 100644
> --- a/net/xfrm/xfrm_interface_core.c
> +++ b/net/xfrm/xfrm_interface_core.c
> @@ -396,6 +396,14 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
>   
>   		if_id = md_info->if_id;
>   		fl->flowi_oif = md_info->link;
> +		if (md_info->dst_orig) {
> +			struct dst_entry *tmp_dst = dst;
> +
> +			dst = md_info->dst_orig;
> +			skb_dst_set(skb, dst);

(contd) ... skb_dst_set() is always called here.  (considering there is 
skb_dst_set_noref()).


> +			md_info->dst_orig = NULL;
> +			dst_release(tmp_dst);
> +		}
>   	} else {
>   		if_id = xi->p.if_id;
>   	}
> @@ -1162,12 +1170,18 @@ static int __init xfrmi_init(void)
>   	if (err < 0)
>   		goto rtnl_link_failed;
>   
> +	err = register_xfrm_interface_bpf();
> +	if (err < 0)
> +		goto kfunc_failed;
> +
>   	lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
>   
>   	xfrm_if_register_cb(&xfrm_if_cb);
>   
>   	return err;
>   
> +kfunc_failed:
> +	rtnl_link_unregister(&xfrmi_link_ops);
>   rtnl_link_failed:
>   	xfrmi6_fini();
>   xfrmi6_failed:
> @@ -1183,6 +1197,7 @@ static void __exit xfrmi_fini(void)
>   {
>   	xfrm_if_unregister_cb();
>   	lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
> +	cleanup_xfrm_interface_bpf();
>   	rtnl_link_unregister(&xfrmi_link_ops);
>   	xfrmi4_fini();
>   	xfrmi6_fini();
Eyal Birger Nov. 29, 2022, 8:36 a.m. UTC | #2
(sent again in plain text, sorry for the noise).

Hi Martin.

On Tue, Nov 29, 2022 at 3:58 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 11/28/22 8:05 AM, Eyal Birger wrote:
> > This change adds xfrm metadata helpers using the unstable kfunc call
> > interface for the TC-BPF hooks. This allows steering traffic towards
> > different IPsec connections based on logic implemented in bpf programs.
> >
> > This object is built based on the availabilty of BTF debug info.
> >
> > The metadata percpu dsts used on TX take ownership of the original skb
> > dsts so that they may be used as part of the xfrm transmittion logic -
> > e.g.  for MTU calculations.
>
> A few quick comments and questions:

Thanks for your comments!

>
> >
> > Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
> > ---
> >   include/net/dst_metadata.h     |  1 +
> >   include/net/xfrm.h             | 20 ++++++++
> >   net/core/dst.c                 |  4 ++
> >   net/xfrm/Makefile              |  6 +++
> >   net/xfrm/xfrm_interface_bpf.c  | 92 ++++++++++++++++++++++++++++++++++
>
> Please tag for bpf-next
Sure. I wasn't totally sure which tree this belongs to.

>
> >   net/xfrm/xfrm_interface_core.c | 15 ++++++
> >   6 files changed, 138 insertions(+)
> >   create mode 100644 net/xfrm/xfrm_interface_bpf.c
> >
> > diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
> > index a454cf4327fe..1b7fae4c6b24 100644
> > --- a/include/net/dst_metadata.h
> > +++ b/include/net/dst_metadata.h
> > @@ -26,6 +26,7 @@ struct macsec_info {
> >   struct xfrm_md_info {
> >       u32 if_id;
> >       int link;
> > +     struct dst_entry *dst_orig;
> >   };
> >
> >   struct metadata_dst {
>
> [ ... ]
>
> > diff --git a/net/core/dst.c b/net/core/dst.c
> > index bc9c9be4e080..4c2eb7e56dab 100644
> > --- a/net/core/dst.c
> > +++ b/net/core/dst.c
> > @@ -315,6 +315,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
> >   #ifdef CONFIG_DST_CACHE
> >       if (md_dst->type == METADATA_IP_TUNNEL)
> >               dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
> > +     else if (md_dst->type == METADATA_XFRM)
> > +             dst_release(md_dst->u.xfrm_info.dst_orig);
>
> Why only release dst_orig under CONFIG_DST_CACHE?
It's a relic from a previous version where I'd used dst cache.
Will move out of this ifdef.

>
> >   #endif
> >       kfree(md_dst);
> >   }
> > @@ -348,6 +350,8 @@ void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
> >
> >               if (one_md_dst->type == METADATA_IP_TUNNEL)
> >                       dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
> > +             else if (one_md_dst->type == METADATA_XFRM)
> > +                     dst_release(one_md_dst->u.xfrm_info.dst_orig);
>
> Same here.

Likewise.

>
> [ ... ]
>
> > diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
> > new file mode 100644
> > index 000000000000..d3997ab7cc28
> > --- /dev/null
> > +++ b/net/xfrm/xfrm_interface_bpf.c
> > @@ -0,0 +1,92 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/* Unstable XFRM Helpers for TC-BPF hook
> > + *
> > + * These are called from SCHED_CLS BPF programs. Note that it is
> > + * allowed to break compatibility for these functions since the interface they
> > + * are exposed through to BPF programs is explicitly unstable.
> > + */
> > +
> > +#include <linux/bpf.h>
> > +#include <linux/btf_ids.h>
> > +
> > +#include <net/dst_metadata.h>
> > +#include <net/xfrm.h>
> > +
> > +struct bpf_xfrm_info {
> > +     u32 if_id;
> > +     int link;
> > +};
> > +
> > +static struct metadata_dst __percpu *xfrm_md_dst;
> > +__diag_push();
> > +__diag_ignore_all("-Wmissing-prototypes",
> > +               "Global functions as their definitions will be in xfrm_interface BTF");
> > +
> > +__used noinline
> > +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
> > +{
> > +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > +     struct xfrm_md_info *info;
> > +
> > +     memset(to, 0, sizeof(*to));
> > +
> > +     info = skb_xfrm_md_info(skb);
> > +     if (!info)
> > +             return -EINVAL;
> > +
> > +     to->if_id = info->if_id;
> > +     to->link = info->link;
> > +     return 0;
> > +}
> > +
> > +__used noinline
> > +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
> > +                       const struct bpf_xfrm_info *from)
> > +{
> > +     struct sk_buff *skb = (struct sk_buff *)skb_ctx;
> > +     struct metadata_dst *md_dst;
> > +     struct xfrm_md_info *info;
> > +
> > +     if (unlikely(skb_metadata_dst(skb)))
> > +             return -EINVAL;
> > +
> > +     md_dst = this_cpu_ptr(xfrm_md_dst);
> > +
> > +     info = &md_dst->u.xfrm_info;
> > +     memset(info, 0, sizeof(*info));
> > +
> > +     info->if_id = from->if_id;
> > +     info->link = from->link;
> > +     info->dst_orig = skb_dst(skb);
> However, the dst_orig init is not done under CONFIG_DST_CACHE though...
>
> Also, is it possible that skb->_skb_refdst has SKB_DST_NOREF set and later below
> ... (contd)
Nice catch! will force dst is refcounted.

>
> > +
> > +     dst_hold((struct dst_entry *)md_dst);
> > +     skb_dst_set(skb, (struct dst_entry *)md_dst);
> > +     return 0;
> > +}
> > +
> > +__diag_pop()
> > +
> > +BTF_SET8_START(xfrm_ifc_kfunc_set)
> > +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
> > +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
> > +BTF_SET8_END(xfrm_ifc_kfunc_set)
> > +
> > +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
> > +     .owner = THIS_MODULE,
> > +     .set   = &xfrm_ifc_kfunc_set,
> > +};
> > +
> > +int __init register_xfrm_interface_bpf(void)
> > +{
> > +     xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
> > +                                             GFP_KERNEL);
> > +     if (!xfrm_md_dst)
> > +             return -ENOMEM;
> > +     return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
> > +                                      &xfrm_interface_kfunc_set);
>
> Will cleanup_xfrm_interface_bpf() be called during error ?
No. Will fix in v2.

Thanks!
Eyal.
Steffen Klassert Nov. 29, 2022, 9:50 a.m. UTC | #3
On Mon, Nov 28, 2022 at 05:58:23PM -0800, Martin KaFai Lau wrote:
> On 11/28/22 8:05 AM, Eyal Birger wrote:
> > This change adds xfrm metadata helpers using the unstable kfunc call
> > interface for the TC-BPF hooks. This allows steering traffic towards
> > different IPsec connections based on logic implemented in bpf programs.
> > 
> > This object is built based on the availabilty of BTF debug info.
> > 
> > The metadata percpu dsts used on TX take ownership of the original skb
> > dsts so that they may be used as part of the xfrm transmittion logic -
> > e.g.  for MTU calculations.
> 
> A few quick comments and questions:
> 
> > 
> > Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
> > ---
> >   include/net/dst_metadata.h     |  1 +
> >   include/net/xfrm.h             | 20 ++++++++
> >   net/core/dst.c                 |  4 ++
> >   net/xfrm/Makefile              |  6 +++
> >   net/xfrm/xfrm_interface_bpf.c  | 92 ++++++++++++++++++++++++++++++++++
> 
> Please tag for bpf-next

This is a change to xfrm ipsec, so it should go
through the ipsec-next tree, unless there is
a good reason for handling that different.
Jakub Kicinski Nov. 29, 2022, 4:15 p.m. UTC | #4
On Tue, 29 Nov 2022 10:50:01 +0100 Steffen Klassert wrote:
> > Please tag for bpf-next  
> 
> This is a change to xfrm ipsec, so it should go
> through the ipsec-next tree, unless there is
> a good reason for handling that different.

Yeah, this is borderline. Do the patches apply cleanly to Linus's tree?
If so maybe they can be posted as a PR and both trees can pull them in,
avoiding any unnecessary back and forth...
Steffen Klassert Nov. 30, 2022, 8:39 a.m. UTC | #5
On Tue, Nov 29, 2022 at 08:15:10AM -0800, Jakub Kicinski wrote:
> On Tue, 29 Nov 2022 10:50:01 +0100 Steffen Klassert wrote:
> > > Please tag for bpf-next  
> > 
> > This is a change to xfrm ipsec, so it should go
> > through the ipsec-next tree, unless there is
> > a good reason for handling that different.
> 
> Yeah, this is borderline. Do the patches apply cleanly to Linus's tree?
> If so maybe they can be posted as a PR and both trees can pull them in,
> avoiding any unnecessary back and forth...

Now, after the last PR was merged, the ipsec-next tree is empty,
and we are close to the end of this development cycle. So I don't
expect conflicts if that patchset goes in before the merge window.
If the bpf-next tree wants to take the v2 version, just let me
know. Otherwise I consider taking it into ipsec-next.
Martin KaFai Lau Nov. 30, 2022, 7:10 p.m. UTC | #6
On 11/29/22 8:15 AM, Jakub Kicinski wrote:
> On Tue, 29 Nov 2022 10:50:01 +0100 Steffen Klassert wrote:
>>> Please tag for bpf-next
>>
>> This is a change to xfrm ipsec, so it should go
>> through the ipsec-next tree, unless there is
>> a good reason for handling that different.

The set is mostly depending on the bpf features.  Patch 2 is mostly depending on 
bpf and patch 3 is also a bpf selftest.  I assume the set should have been 
developed based on the bpf-next tree instead.  It is also good to have the test 
run in bpf CI sooner than later to bar on-going bpf changes that may break it. 
It is the reason I think bpf-next makes more sense.

If it is preferred to go through ipsec-next, the set should at least be tested 
against the bpf-next before posting.

https://patchwork.kernel.org/project/netdevbpf/patch/20221129132018.985887-4-eyal.birger@gmail.com/
Steffen Klassert Dec. 1, 2022, 7:12 a.m. UTC | #7
On Wed, Nov 30, 2022 at 11:10:13AM -0800, Martin KaFai Lau wrote:
> On 11/29/22 8:15 AM, Jakub Kicinski wrote:
> > On Tue, 29 Nov 2022 10:50:01 +0100 Steffen Klassert wrote:
> > > > Please tag for bpf-next
> > > 
> > > This is a change to xfrm ipsec, so it should go
> > > through the ipsec-next tree, unless there is
> > > a good reason for handling that different.
> 
> The set is mostly depending on the bpf features.  Patch 2 is mostly
> depending on bpf and patch 3 is also a bpf selftest.  I assume the set
> should have been developed based on the bpf-next tree instead.  It is also
> good to have the test run in bpf CI sooner than later to bar on-going bpf
> changes that may break it. It is the reason I think bpf-next makes more
> sense.

As said, if there is a good reason, I'm ok with routing it
through bpf-next. Looks like there is a good readon, so
go with bpf-next.
diff mbox series

Patch

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index a454cf4327fe..1b7fae4c6b24 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -26,6 +26,7 @@  struct macsec_info {
 struct xfrm_md_info {
 	u32 if_id;
 	int link;
+	struct dst_entry *dst_orig;
 };
 
 struct metadata_dst {
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e0cc6791c001..5e5fea3087b6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2086,4 +2086,24 @@  static inline bool xfrm6_local_dontfrag(const struct sock *sk)
 	return false;
 }
 #endif
+
+#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_xfrm_interface_bpf(void);
+extern void cleanup_xfrm_interface_bpf(void);
+
+#else
+
+static inline int register_xfrm_interface_bpf(void)
+{
+	return 0;
+}
+
+static inline void cleanup_xfrm_interface_bpf(void)
+{
+}
+
+#endif
+
 #endif	/* _NET_XFRM_H */
diff --git a/net/core/dst.c b/net/core/dst.c
index bc9c9be4e080..4c2eb7e56dab 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -315,6 +315,8 @@  void metadata_dst_free(struct metadata_dst *md_dst)
 #ifdef CONFIG_DST_CACHE
 	if (md_dst->type == METADATA_IP_TUNNEL)
 		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+	else if (md_dst->type == METADATA_XFRM)
+		dst_release(md_dst->u.xfrm_info.dst_orig);
 #endif
 	kfree(md_dst);
 }
@@ -348,6 +350,8 @@  void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
 
 		if (one_md_dst->type == METADATA_IP_TUNNEL)
 			dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
+		else if (one_md_dst->type == METADATA_XFRM)
+			dst_release(one_md_dst->u.xfrm_info.dst_orig);
 	}
 #endif
 	free_percpu(md_dst);
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 08a2870fdd36..cd47f88921f5 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -5,6 +5,12 @@ 
 
 xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o
 
+ifeq ($(CONFIG_XFRM_INTERFACE),m)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o
+else ifeq ($(CONFIG_XFRM_INTERFACE),y)
+xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o
+endif
+
 obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
 		      xfrm_input.o xfrm_output.o \
 		      xfrm_sysctl.o xfrm_replay.o xfrm_device.o
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
new file mode 100644
index 000000000000..d3997ab7cc28
--- /dev/null
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -0,0 +1,92 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/xfrm.h>
+
+struct bpf_xfrm_info {
+	u32 if_id;
+	int link;
+};
+
+static struct metadata_dst __percpu *xfrm_md_dst;
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in xfrm_interface BTF");
+
+__used noinline
+int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct xfrm_md_info *info;
+
+	memset(to, 0, sizeof(*to));
+
+	info = skb_xfrm_md_info(skb);
+	if (!info)
+		return -EINVAL;
+
+	to->if_id = info->if_id;
+	to->link = info->link;
+	return 0;
+}
+
+__used noinline
+int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
+			  const struct bpf_xfrm_info *from)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct metadata_dst *md_dst;
+	struct xfrm_md_info *info;
+
+	if (unlikely(skb_metadata_dst(skb)))
+		return -EINVAL;
+
+	md_dst = this_cpu_ptr(xfrm_md_dst);
+
+	info = &md_dst->u.xfrm_info;
+	memset(info, 0, sizeof(*info));
+
+	info->if_id = from->if_id;
+	info->link = from->link;
+	info->dst_orig = skb_dst(skb);
+
+	dst_hold((struct dst_entry *)md_dst);
+	skb_dst_set(skb, (struct dst_entry *)md_dst);
+	return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(xfrm_ifc_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info)
+BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info)
+BTF_SET8_END(xfrm_ifc_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &xfrm_ifc_kfunc_set,
+};
+
+int __init register_xfrm_interface_bpf(void)
+{
+	xfrm_md_dst = metadata_dst_alloc_percpu(0, METADATA_XFRM,
+						GFP_KERNEL);
+	if (!xfrm_md_dst)
+		return -ENOMEM;
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+					 &xfrm_interface_kfunc_set);
+}
+
+void __exit cleanup_xfrm_interface_bpf(void)
+{
+	metadata_dst_free_percpu(xfrm_md_dst);
+}
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index 5a67b120c4db..1e1e8e965939 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -396,6 +396,14 @@  xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 
 		if_id = md_info->if_id;
 		fl->flowi_oif = md_info->link;
+		if (md_info->dst_orig) {
+			struct dst_entry *tmp_dst = dst;
+
+			dst = md_info->dst_orig;
+			skb_dst_set(skb, dst);
+			md_info->dst_orig = NULL;
+			dst_release(tmp_dst);
+		}
 	} else {
 		if_id = xi->p.if_id;
 	}
@@ -1162,12 +1170,18 @@  static int __init xfrmi_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	err = register_xfrm_interface_bpf();
+	if (err < 0)
+		goto kfunc_failed;
+
 	lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
 
 	xfrm_if_register_cb(&xfrm_if_cb);
 
 	return err;
 
+kfunc_failed:
+	rtnl_link_unregister(&xfrmi_link_ops);
 rtnl_link_failed:
 	xfrmi6_fini();
 xfrmi6_failed:
@@ -1183,6 +1197,7 @@  static void __exit xfrmi_fini(void)
 {
 	xfrm_if_unregister_cb();
 	lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
+	cleanup_xfrm_interface_bpf();
 	rtnl_link_unregister(&xfrmi_link_ops);
 	xfrmi4_fini();
 	xfrmi6_fini();