diff mbox series

[PATCHv2,bpf-next,2/4] xdp: extend xdp_redirect_map with broadcast support

Message ID 20210309101321.2138655-3-liuhangbin@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series xdp: extend xdp_redirect_map with broadcast support | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 10 maintainers not CCed: joe@cilium.io yhs@fb.com hawk@kernel.org kpsingh@kernel.org andrii@kernel.org kafai@fb.com songliubraving@fb.com davem@davemloft.net quentin@isovalent.com kuba@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 11835 this patch: 11835
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: please, no space before tabs
netdev/build_allmodconfig_warn success Errors and warnings before: 12482 this patch: 12482
netdev/header_inline success Link

Commit Message

Hangbin Liu March 9, 2021, 10:13 a.m. UTC
This patch add two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to extend
xdp_redirect_map for broadcast support.

Keep the general data path in net/core/filter.c and the native data
path in kernel/bpf/devmap.c so we can use direct calls to get better
performace.

Here is the performance result by using xdp_redirect_{map, map_multi} in
sample/bpf and send pkts via pktgen cmd:
./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64

There are some drop back as we need to loop the map and get each
interface.

Version      | Test                                | Generic | Native
5.11         | redirect_map        i40e->i40e      |    1.9M |  9.3M
5.11         | redirect_map        i40e->veth      |    1.5M | 11.2M
5.11 + patch | redirect_map        i40e->i40e      |    1.9M |  9.6M
5.11 + patch | redirect_map        i40e->veth      |    1.5M | 11.9M
5.11 + patch | redirect_map_multi  i40e->i40e      |    1.5M |  7.7M
5.11 + patch | redirect_map_multi  i40e->veth      |    1.2M |  9.1M
5.11 + patch | redirect_map_multi  i40e->mlx4+veth |    0.9M |  3.2M

v2: fix flag renaming issue in v1

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 include/linux/bpf.h            |  16 +++++
 include/net/xdp.h              |   1 +
 include/uapi/linux/bpf.h       |  17 ++++-
 kernel/bpf/devmap.c            | 119 +++++++++++++++++++++++++++++++++
 net/core/filter.c              |  74 ++++++++++++++++++--
 net/core/xdp.c                 |  29 ++++++++
 tools/include/uapi/linux/bpf.h |  17 ++++-
 7 files changed, 262 insertions(+), 11 deletions(-)

Comments

Toke Høiland-Jørgensen March 17, 2021, 12:03 p.m. UTC | #1
Hangbin Liu <liuhangbin@gmail.com> writes:

> This patch add two flags BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS to extend
> xdp_redirect_map for broadcast support.
>
> Keep the general data path in net/core/filter.c and the native data
> path in kernel/bpf/devmap.c so we can use direct calls to get better
> performace.
>
> Here is the performance result by using xdp_redirect_{map, map_multi} in
> sample/bpf and send pkts via pktgen cmd:
> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
>
> There are some drop back as we need to loop the map and get each
> interface.
>
> Version      | Test                                | Generic | Native
> 5.11         | redirect_map        i40e->i40e      |    1.9M |  9.3M
> 5.11         | redirect_map        i40e->veth      |    1.5M | 11.2M
> 5.11 + patch | redirect_map        i40e->i40e      |    1.9M |  9.6M
> 5.11 + patch | redirect_map        i40e->veth      |    1.5M | 11.9M
> 5.11 + patch | redirect_map_multi  i40e->i40e      |    1.5M |  7.7M
> 5.11 + patch | redirect_map_multi  i40e->veth      |    1.2M |  9.1M
> 5.11 + patch | redirect_map_multi  i40e->mlx4+veth |    0.9M |  3.2M
>
> v2: fix flag renaming issue in v1
>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>

FYI, this no longer applies to bpf-next due to Björn's refactor in
commit: ee75aef23afe ("bpf, xdp: Restructure redirect actions")

Also, two small nits below:


> ---
>  include/linux/bpf.h            |  16 +++++
>  include/net/xdp.h              |   1 +
>  include/uapi/linux/bpf.h       |  17 ++++-
>  kernel/bpf/devmap.c            | 119 +++++++++++++++++++++++++++++++++
>  net/core/filter.c              |  74 ++++++++++++++++++--
>  net/core/xdp.c                 |  29 ++++++++
>  tools/include/uapi/linux/bpf.h |  17 ++++-
>  7 files changed, 262 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c931bc97019d..bb07ccd170f2 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1458,6 +1458,9 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
>  		    struct net_device *dev_rx);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  		    struct net_device *dev_rx);
> +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex);
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> +			  struct bpf_map *map, bool exclude_ingress);
>  int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
>  			     struct bpf_prog *xdp_prog);
>  bool dev_map_can_have_prog(struct bpf_map *map);
> @@ -1630,6 +1633,19 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  	return 0;
>  }
>  
> +static inline
> +bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex)
> +{
> +	return false;
> +}
> +
> +static inline
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> +			  struct bpf_map *map, bool exclude_ingress)
> +{
> +	return 0;
> +}
> +
>  struct sk_buff;
>  
>  static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index a5bc214a49d9..5533f0ab2afc 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
>  struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
>  					 struct net_device *dev);
>  int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
> +struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
>  
>  static inline
>  void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 2d3036e292a9..5982ceb217dc 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2508,8 +2508,12 @@ union bpf_attr {
>   * 		The lower two bits of *flags* are used as the return code if
>   * 		the map lookup fails. This is so that the return value can be
>   * 		one of the XDP program return codes up to **XDP_TX**, as chosen
> - * 		by the caller. Any higher bits in the *flags* argument must be
> - * 		unset.
> + * 		by the caller. The higher bits of *flags* can be set to
> + * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
> + *
> + * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
> + * 		interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress
> + * 		interface will be excluded when do broadcasting.
>   *
>   * 		See also **bpf_redirect**\ (), which only supports redirecting
>   * 		to an ifindex, but doesn't require a map to do so.
> @@ -5004,6 +5008,15 @@ enum {
>  	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
>  };
>  
> +/* Flags for bpf_redirect_map helper */
> +enum {
> +	BPF_F_BROADCAST		= (1ULL << 3),
> +	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
> +};
> +
> +#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
> +#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
> +
>  #define __bpf_md_ptr(type, name)	\
>  union {					\
>  	type name;			\
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index f80cf5036d39..ad616a043d2a 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -519,6 +519,125 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
>  }
>  
> +/* Use direct call in fast path instead of map->ops->map_get_next_key() */
> +static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key)
> +{
> +	switch (map->map_type) {
> +	case BPF_MAP_TYPE_DEVMAP:
> +		return dev_map_get_next_key(map, key, next_key);
> +	case BPF_MAP_TYPE_DEVMAP_HASH:
> +		return dev_map_hash_get_next_key(map, key, next_key);
> +	default:
> +		break;
> +	}
> +
> +	return -ENOENT;
> +}
> +
> +bool dst_dev_is_ingress(struct bpf_dtab_netdev *dst, int ifindex)
> +{
> +	return dst->dev->ifindex == ifindex;
> +}
> +
> +static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp,
> +						   struct bpf_map *map,
> +						   u32 *key, u32 *next_key,
> +						   int ex_ifindex)
> +{
> +	struct bpf_dtab_netdev *obj;
> +	struct net_device *dev;
> +	u32 *tmp_key = key;

why is tmp_key needed? you're not using key for anything else, so you
could just substitute that for all of the uses of tmp_key below?

> +	u32 index;
> +	int err;
> +
> +	err = devmap_get_next_key(map, tmp_key, next_key);
> +	if (err)
> +		return NULL;
> +
> +	/* When using dev map hash, we could restart the hashtab traversal
> +	 * in case the key has been updated/removed in the mean time.
> +	 * So we may end up potentially looping due to traversal restarts
> +	 * from first elem.
> +	 *
> +	 * Let's use map's max_entries to limit the loop number.
> +	 */
> +	for (index = 0; index < map->max_entries; index++) {
> +		switch (map->map_type) {
> +		case BPF_MAP_TYPE_DEVMAP:
> +			obj = __dev_map_lookup_elem(map, *next_key);
> +			break;
> +		case BPF_MAP_TYPE_DEVMAP_HASH:
> +			obj = __dev_map_hash_lookup_elem(map, *next_key);
> +			break;
> +		default:
> +			break;
> +		}
> +
> +		if (!obj || dst_dev_is_ingress(obj, ex_ifindex))
> +			goto find_next;
> +
> +		dev = obj->dev;
> +
> +		if (!dev->netdev_ops->ndo_xdp_xmit)
> +			goto find_next;
> +
> +		err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
> +		if (unlikely(err))
> +			goto find_next;
> +
> +		return obj;
> +
> +find_next:
> +		tmp_key = next_key;
> +		err = devmap_get_next_key(map, tmp_key, next_key);
> +		if (err)
> +			break;
> +	}
> +
> +	return NULL;
> +}
> +
> +int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
> +			  struct bpf_map *map, bool exclude_ingress)
> +{
> +	struct bpf_dtab_netdev *obj = NULL, *next_obj = NULL;
> +	struct xdp_frame *xdpf, *nxdpf;
> +	int ex_ifindex;
> +	u32 key, next_key;

Out of reverse-xmas-tree order...


-Toke
Hangbin Liu March 18, 2021, 3:52 a.m. UTC | #2
On Wed, Mar 17, 2021 at 01:03:02PM +0100, Toke Høiland-Jørgensen wrote:
> FYI, this no longer applies to bpf-next due to Björn's refactor in
> commit: ee75aef23afe ("bpf, xdp: Restructure redirect actions")

Thanks Toke, I need to see how to get the map via map_id, does
bpf_map_get_curr_or_next() works? Should I call bpf_map_put() after using?

The ri->flags = flags also need to be add back as we need to use the flags
value.

This looks like an opposite of Björn's restructure...

And I have expected another rebase after Lorenzo's "bpf: devmap: move drop
error path to devmap for XDP_REDIRECT"

> 
> Also, two small nits below:

Thanks, I will fix them if there is a way to do the rebase.

Hangbin
Björn Töpel March 18, 2021, 8:20 a.m. UTC | #3
On Thu, 18 Mar 2021 at 04:52, Hangbin Liu <liuhangbin@gmail.com> wrote:
>
> On Wed, Mar 17, 2021 at 01:03:02PM +0100, Toke Høiland-Jørgensen wrote:
> > FYI, this no longer applies to bpf-next due to Björn's refactor in
> > commit: ee75aef23afe ("bpf, xdp: Restructure redirect actions")
>
> Thanks Toke, I need to see how to get the map via map_id, does
> bpf_map_get_curr_or_next() works? Should I call bpf_map_put() after using?
>
> The ri->flags = flags also need to be add back as we need to use the flags
> value.
>

Hmm, I was under the impression that ri->flags was only to be used by
the new bpf_redirect_map_multi(), but now I see that you're planning
to use the bpf_redirect_map instead. Well... I guess the flag is back
then.


Björn
Toke Høiland-Jørgensen March 18, 2021, 2:19 p.m. UTC | #4
Hangbin Liu <liuhangbin@gmail.com> writes:

> On Wed, Mar 17, 2021 at 01:03:02PM +0100, Toke Høiland-Jørgensen wrote:
>> FYI, this no longer applies to bpf-next due to Björn's refactor in
>> commit: ee75aef23afe ("bpf, xdp: Restructure redirect actions")
>
> Thanks Toke, I need to see how to get the map via map_id, does
> bpf_map_get_curr_or_next() works? Should I call bpf_map_put() after
> using?

I would expect that to be terrible for performance; I think it would be
better to just add back the map pointer into struct bpf_redirect_info.
If you only set the map pointer when the multicast flag is set, you can
just check that pointer to disambiguate between when you need to call
dev_map_enqueue() and dev_map_enqueue_multi(), in which case you don't
need to add back the flags member...

-Toke
Hangbin Liu March 23, 2021, 2:49 a.m. UTC | #5
On Thu, Mar 18, 2021 at 03:19:47PM +0100, Toke Høiland-Jørgensen wrote:
> Hangbin Liu <liuhangbin@gmail.com> writes:
> 
> > On Wed, Mar 17, 2021 at 01:03:02PM +0100, Toke Høiland-Jørgensen wrote:
> >> FYI, this no longer applies to bpf-next due to Björn's refactor in
> >> commit: ee75aef23afe ("bpf, xdp: Restructure redirect actions")
> >
> > Thanks Toke, I need to see how to get the map via map_id, does
> > bpf_map_get_curr_or_next() works? Should I call bpf_map_put() after
> > using?
> 
> I would expect that to be terrible for performance; I think it would be
> better to just add back the map pointer into struct bpf_redirect_info.
> If you only set the map pointer when the multicast flag is set, you can
> just check that pointer to disambiguate between when you need to call
> dev_map_enqueue() and dev_map_enqueue_multi(), in which case you don't
> need to add back the flags member...

There are 2 flags, BROADCAST and EXCLUDE_INGRESS. There is no way
to only check the map pointer and ignore flags..

Thanks
Hangbin
Toke Høiland-Jørgensen March 23, 2021, 10:15 a.m. UTC | #6
Hangbin Liu <liuhangbin@gmail.com> writes:

> On Thu, Mar 18, 2021 at 03:19:47PM +0100, Toke Høiland-Jørgensen wrote:
>> Hangbin Liu <liuhangbin@gmail.com> writes:
>> 
>> > On Wed, Mar 17, 2021 at 01:03:02PM +0100, Toke Høiland-Jørgensen wrote:
>> >> FYI, this no longer applies to bpf-next due to Björn's refactor in
>> >> commit: ee75aef23afe ("bpf, xdp: Restructure redirect actions")
>> >
>> > Thanks Toke, I need to see how to get the map via map_id, does
>> > bpf_map_get_curr_or_next() works? Should I call bpf_map_put() after
>> > using?
>> 
>> I would expect that to be terrible for performance; I think it would be
>> better to just add back the map pointer into struct bpf_redirect_info.
>> If you only set the map pointer when the multicast flag is set, you can
>> just check that pointer to disambiguate between when you need to call
>> dev_map_enqueue() and dev_map_enqueue_multi(), in which case you don't
>> need to add back the flags member...
>
> There are 2 flags, BROADCAST and EXCLUDE_INGRESS. There is no way
> to only check the map pointer and ignore flags..

Ah, right, of course, my bad :)

Well, in that case adding both members back is probably the right thing
to do...

-Toke
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c931bc97019d..bb07ccd170f2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1458,6 +1458,9 @@  int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 		    struct net_device *dev_rx);
+bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex);
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress);
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog);
 bool dev_map_can_have_prog(struct bpf_map *map);
@@ -1630,6 +1633,19 @@  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 	return 0;
 }
 
+static inline
+bool dst_dev_is_ingress(struct bpf_dtab_netdev *obj, int ifindex)
+{
+	return false;
+}
+
+static inline
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress)
+{
+	return 0;
+}
+
 struct sk_buff;
 
 static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
diff --git a/include/net/xdp.h b/include/net/xdp.h
index a5bc214a49d9..5533f0ab2afc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -170,6 +170,7 @@  struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 					 struct net_device *dev);
 int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
 
 static inline
 void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2d3036e292a9..5982ceb217dc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2508,8 +2508,12 @@  union bpf_attr {
  * 		The lower two bits of *flags* are used as the return code if
  * 		the map lookup fails. This is so that the return value can be
  * 		one of the XDP program return codes up to **XDP_TX**, as chosen
- * 		by the caller. Any higher bits in the *flags* argument must be
- * 		unset.
+ * 		by the caller. The higher bits of *flags* can be set to
+ * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * 		interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress
+ * 		interface will be excluded when do broadcasting.
  *
  * 		See also **bpf_redirect**\ (), which only supports redirecting
  * 		to an ifindex, but doesn't require a map to do so.
@@ -5004,6 +5008,15 @@  enum {
 	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
 };
 
+/* Flags for bpf_redirect_map helper */
+enum {
+	BPF_F_BROADCAST		= (1ULL << 3),
+	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
+};
+
+#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
+#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f80cf5036d39..ad616a043d2a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -519,6 +519,125 @@  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
 }
 
+/* Use direct call in fast path instead of map->ops->map_get_next_key() */
+static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_DEVMAP:
+		return dev_map_get_next_key(map, key, next_key);
+	case BPF_MAP_TYPE_DEVMAP_HASH:
+		return dev_map_hash_get_next_key(map, key, next_key);
+	default:
+		break;
+	}
+
+	return -ENOENT;
+}
+
+bool dst_dev_is_ingress(struct bpf_dtab_netdev *dst, int ifindex)
+{
+	return dst->dev->ifindex == ifindex;
+}
+
+static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp,
+						   struct bpf_map *map,
+						   u32 *key, u32 *next_key,
+						   int ex_ifindex)
+{
+	struct bpf_dtab_netdev *obj;
+	struct net_device *dev;
+	u32 *tmp_key = key;
+	u32 index;
+	int err;
+
+	err = devmap_get_next_key(map, tmp_key, next_key);
+	if (err)
+		return NULL;
+
+	/* When using dev map hash, we could restart the hashtab traversal
+	 * in case the key has been updated/removed in the mean time.
+	 * So we may end up potentially looping due to traversal restarts
+	 * from first elem.
+	 *
+	 * Let's use map's max_entries to limit the loop number.
+	 */
+	for (index = 0; index < map->max_entries; index++) {
+		switch (map->map_type) {
+		case BPF_MAP_TYPE_DEVMAP:
+			obj = __dev_map_lookup_elem(map, *next_key);
+			break;
+		case BPF_MAP_TYPE_DEVMAP_HASH:
+			obj = __dev_map_hash_lookup_elem(map, *next_key);
+			break;
+		default:
+			break;
+		}
+
+		if (!obj || dst_dev_is_ingress(obj, ex_ifindex))
+			goto find_next;
+
+		dev = obj->dev;
+
+		if (!dev->netdev_ops->ndo_xdp_xmit)
+			goto find_next;
+
+		err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+		if (unlikely(err))
+			goto find_next;
+
+		return obj;
+
+find_next:
+		tmp_key = next_key;
+		err = devmap_get_next_key(map, tmp_key, next_key);
+		if (err)
+			break;
+	}
+
+	return NULL;
+}
+
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+			  struct bpf_map *map, bool exclude_ingress)
+{
+	struct bpf_dtab_netdev *obj = NULL, *next_obj = NULL;
+	struct xdp_frame *xdpf, *nxdpf;
+	int ex_ifindex;
+	u32 key, next_key;
+
+	ex_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
+
+	/* Find first available obj */
+	obj = devmap_get_next_obj(xdp, map, NULL, &key, ex_ifindex);
+	if (!obj)
+		return -ENOENT;
+
+	xdpf = xdp_convert_buff_to_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	for (;;) {
+		/* Check if we still have one more available obj */
+		next_obj = devmap_get_next_obj(xdp, map, &key, &next_key, ex_ifindex);
+		if (!next_obj) {
+			bq_enqueue(obj->dev, xdpf, dev_rx, obj->xdp_prog);
+			return 0;
+		}
+
+		nxdpf = xdpf_clone(xdpf);
+		if (unlikely(!nxdpf)) {
+			xdp_return_frame_rx_napi(xdpf);
+			return -ENOMEM;
+		}
+
+		bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+		/* Deal with next obj */
+		obj = next_obj;
+		key = next_key;
+	}
+}
+
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 			     struct bpf_prog *xdp_prog)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 588b19ba0da8..3736a9116be9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3919,12 +3919,17 @@  static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
 };
 
 static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
-			    struct bpf_map *map, struct xdp_buff *xdp)
+			    struct bpf_map *map, struct xdp_buff *xdp,
+			    u32 flags)
 {
 	switch (map->map_type) {
 	case BPF_MAP_TYPE_DEVMAP:
 	case BPF_MAP_TYPE_DEVMAP_HASH:
-		return dev_map_enqueue(fwd, xdp, dev_rx);
+		if (flags & BPF_F_BROADCAST)
+			return dev_map_enqueue_multi(xdp, dev_rx, map,
+						     flags & BPF_F_EXCLUDE_INGRESS);
+		else
+			return dev_map_enqueue(fwd, xdp, dev_rx);
 	case BPF_MAP_TYPE_CPUMAP:
 		return cpu_map_enqueue(fwd, xdp, dev_rx);
 	case BPF_MAP_TYPE_XSKMAP:
@@ -3998,7 +4003,7 @@  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 
 		err = dev_xdp_enqueue(fwd, xdp, dev);
 	} else {
-		err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+		err = __bpf_tx_xdp_map(dev, fwd, map, xdp, ri->flags);
 	}
 
 	if (unlikely(err))
@@ -4012,6 +4017,57 @@  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
+static int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+				  struct bpf_prog *xdp_prog, struct bpf_map *map,
+				  bool exclude_ingress)
+{
+	struct bpf_dtab_netdev *dst;
+	u32 key, next_key, index;
+	struct sk_buff *nskb;
+	void *fwd;
+	int err;
+
+	err = map->ops->map_get_next_key(map, NULL, &key);
+	if (err)
+		return err;
+
+	/* When using dev map hash, we could restart the hashtab traversal
+	 * in case the key has been updated/removed in the mean time.
+	 * So we may end up potentially looping due to traversal restarts
+	 * from first elem.
+	 *
+	 * Let's use map's max_entries to limit the loop number.
+	 */
+
+	for (index = 0; index < map->max_entries; index++) {
+		fwd = __xdp_map_lookup_elem(map, key);
+		if (fwd) {
+			dst = (struct bpf_dtab_netdev *)fwd;
+			if (dst_dev_is_ingress(dst, exclude_ingress ? dev->ifindex : 0))
+				goto find_next;
+
+			nskb = skb_clone(skb, GFP_ATOMIC);
+			if (!nskb)
+				return -ENOMEM;
+
+			/* Try forword next one no mater the current forward
+			 * succeed or not.
+			 */
+			dev_map_generic_redirect(dst, nskb, xdp_prog);
+		}
+
+find_next:
+		err = map->ops->map_get_next_key(map, &key, &next_key);
+		if (err)
+			break;
+
+		key = next_key;
+	}
+
+	consume_skb(skb);
+	return 0;
+}
+
 static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       struct sk_buff *skb,
 				       struct xdp_buff *xdp,
@@ -4031,7 +4087,11 @@  static int xdp_do_generic_redirect_map(struct net_device *dev,
 	    map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 		struct bpf_dtab_netdev *dst = fwd;
 
-		err = dev_map_generic_redirect(dst, skb, xdp_prog);
+		if (ri->flags & BPF_F_BROADCAST)
+			err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+						     ri->flags & BPF_F_EXCLUDE_INGRESS);
+		else
+			err = dev_map_generic_redirect(dst, skb, xdp_prog);
 		if (unlikely(err))
 			goto err;
 	} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
@@ -4115,18 +4175,18 @@  BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
 	/* Lower bits of the flags are used as return code on lookup failure */
-	if (unlikely(flags > XDP_TX))
+	if (unlikely(flags & ~(BPF_F_ACTION_MASK | BPF_F_REDIR_MASK)))
 		return XDP_ABORTED;
 
 	ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
-	if (unlikely(!ri->tgt_value)) {
+	if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
 		/* If the lookup fails we want to clear out the state in the
 		 * redirect_info struct completely, so that if an eBPF program
 		 * performs multiple lookups, the last one always takes
 		 * precedence.
 		 */
 		WRITE_ONCE(ri->map, NULL);
-		return flags;
+		return flags & BPF_F_ACTION_MASK;
 	}
 
 	ri->flags = flags;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 05354976c1fc..aba84d04642b 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -583,3 +583,32 @@  struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 	return __xdp_build_skb_from_frame(xdpf, skb, dev);
 }
 EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+	unsigned int headroom, totalsize;
+	struct xdp_frame *nxdpf;
+	struct page *page;
+	void *addr;
+
+	headroom = xdpf->headroom + sizeof(*xdpf);
+	totalsize = headroom + xdpf->len;
+
+	if (unlikely(totalsize > PAGE_SIZE))
+		return NULL;
+	page = dev_alloc_page();
+	if (!page)
+		return NULL;
+	addr = page_to_virt(page);
+
+	memcpy(addr, xdpf, totalsize);
+
+	nxdpf = addr;
+	nxdpf->data = addr + headroom;
+	nxdpf->frame_sz = PAGE_SIZE;
+	nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+	nxdpf->mem.id = 0;
+
+	return nxdpf;
+}
+EXPORT_SYMBOL_GPL(xdpf_clone);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2d3036e292a9..5982ceb217dc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2508,8 +2508,12 @@  union bpf_attr {
  * 		The lower two bits of *flags* are used as the return code if
  * 		the map lookup fails. This is so that the return value can be
  * 		one of the XDP program return codes up to **XDP_TX**, as chosen
- * 		by the caller. Any higher bits in the *flags* argument must be
- * 		unset.
+ * 		by the caller. The higher bits of *flags* can be set to
+ * 		BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * 		With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * 		interfaces in the map. with BPF_F_EXCLUDE_INGRESS the ingress
+ * 		interface will be excluded when do broadcasting.
  *
  * 		See also **bpf_redirect**\ (), which only supports redirecting
  * 		to an ifindex, but doesn't require a map to do so.
@@ -5004,6 +5008,15 @@  enum {
 	BPF_F_BPRM_SECUREEXEC	= (1ULL << 0),
 };
 
+/* Flags for bpf_redirect_map helper */
+enum {
+	BPF_F_BROADCAST		= (1ULL << 3),
+	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4),
+};
+
+#define BPF_F_ACTION_MASK (XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX)
+#define BPF_F_REDIR_MASK (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
+
 #define __bpf_md_ptr(type, name)	\
 union {					\
 	type name;			\