
[PATCHv17,bpf-next,1/6] bpf: run devmap xdp_prog on flush instead of bulk enqueue

Message ID 20210125124516.3098129-2-liuhangbin@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series xdp: add a new helper for dev map multicast support

Checks

Context Check Description
netdev/cover_letter success
netdev/fixes_present success
netdev/patch_count success
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success
netdev/cc_maintainers warning 8 maintainers not CCed: davem@davemloft.net songliubraving@fb.com andrii@kernel.org hawk@kernel.org kpsingh@kernel.org kuba@kernel.org kafai@fb.com yhs@fb.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 1 this patch: 1
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 223 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 1 this patch: 1
netdev/header_inline success
netdev/stable success Stable not CCed

Commit Message

Hangbin Liu Jan. 25, 2021, 12:45 p.m. UTC
From: Jesper Dangaard Brouer <brouer@redhat.com>

This changes the devmap XDP program support to run the program when the
bulk queue is flushed instead of before the frame is enqueued. This has
a couple of benefits:

- It "sorts" the packets by destination devmap entry, and then runs the
  same BPF program on all the packets in sequence. This ensures that we
  keep the XDP program and destination device properties hot in I-cache.

- It makes the multicast implementation simpler because it can just
  enqueue packets using bq_enqueue() without having to deal with the
  devmap program at all.

The drawback is that if the devmap program drops the packet, the enqueue
step is redundant. However, arguably this is mostly visible in a
micro-benchmark, and with more mixed traffic the I-cache benefit should
win out.

The logic of bq_xmit_all() is also refactored and the error label is
removed. When bq_xmit_all() is called from bq_enqueue(), another packet
will always be enqueued immediately after, so clearing dev_rx, xdp_prog
and flush_node in bq_xmit_all() is redundant. Let's move the clearing to
__dev_flush(), and only check the fields once in bq_enqueue() since they
are all modified together.

The performance impact of just this patch is as follows:

Using xdp_redirect_map from samples/bpf and sending packets via pktgen:
./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64

There is about +/- 0.1M deviation in the native tests; the performance
improved for the base case, but dropped back somewhat with an XDP devmap
program attached.

Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M

[1] https://lore.kernel.org/bpf/20210122025007.2968381-1-liuhangbin@gmail.com

Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>

---
v17:
a) rename to_sent to to_send.
b) clear bq dev_rx, xdp_prog and flush_node in __dev_flush().

v16:
a) refactor bq_xmit_all logic and remove error label

v15:
a) do not use unlikely when checking bq->xdp_prog
b) return sent frames for dev_map_bpf_prog_run()

v14: no update, only rebase the code
v13: pass in xdp_prog through __xdp_enqueue()
v2-v12: no this patch
---
 kernel/bpf/devmap.c | 146 +++++++++++++++++++++++++-------------------
 1 file changed, 84 insertions(+), 62 deletions(-)
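
For readers who want to reproduce the "Native + 2nd xdp_prog" column above, the snippet below is a minimal userspace loader sketch (not part of this series) of how a per-entry devmap program is attached. The object file, program and map names are made up; the devmap must be created with value_size == sizeof(struct bpf_devmap_val), and only existing libbpf calls and the BPF_XDP_DEVMAP attach type are used.

/* Hypothetical loader sketch -- object/program/map names are illustrative. */
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/bpf.h>

int attach_devmap_prog(const char *obj_path, int egress_ifindex, __u32 map_key)
{
	struct bpf_devmap_val val = { .ifindex = egress_ifindex };
	struct bpf_program *prog;
	struct bpf_object *obj;
	struct bpf_map *map;

	obj = bpf_object__open_file(obj_path, NULL);
	if (libbpf_get_error(obj))
		return -1;

	prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
	map = bpf_object__find_map_by_name(obj, "dev_map");
	if (!prog || !map)
		return -1;

	/* Load the program as a devmap ("egress") XDP program. */
	bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
	bpf_program__set_expected_attach_type(prog, BPF_XDP_DEVMAP);
	if (bpf_object__load(obj))
		return -1;

	/* Store the prog fd next to the egress ifindex in the devmap slot;
	 * with this patch, bq_xmit_all() runs the program at flush time.
	 */
	val.bpf_prog.fd = bpf_program__fd(prog);
	return bpf_map_update_elem(bpf_map__fd(map), &map_key, &val, 0);
}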

Comments

John Fastabend Jan. 27, 2021, 7:37 a.m. UTC | #1
Hangbin Liu wrote:
> From: Jesper Dangaard Brouer <brouer@redhat.com>
> 
> This changes the devmap XDP program support to run the program when the
> bulk queue is flushed instead of before the frame is enqueued. This has
> a couple of benefits:
> 
> - It "sorts" the packets by destination devmap entry, and then runs the
>   same BPF program on all the packets in sequence. This ensures that we
>   keep the XDP program and destination device properties hot in I-cache.
> 
> - It makes the multicast implementation simpler because it can just
>   enqueue packets using bq_enqueue() without having to deal with the
>   devmap program at all.
> 
> The drawback is that if the devmap program drops the packet, the enqueue
> step is redundant. However, arguably this is mostly visible in a
> micro-benchmark, and with more mixed traffic the I-cache benefit should
> win out. The performance impact of just this patch is as follows:
> 
> The bq_xmit_all's logic is also refactored and error label is removed.
> When bq_xmit_all() is called from bq_enqueue(), another packet will
> always be enqueued immediately after, so clearing dev_rx, xdp_prog and
> flush_node in bq_xmit_all() is redundant. Let's move the clear to
> __dev_flush(), and only check them once in bq_enqueue() since they are
> all modified together.
> 
> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd:
> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
> 
> There are about +/- 0.1M deviation for native testing, the performance
> improved for the base-case, but some drop back with xdp devmap prog attached.
> 
> Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
> 5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
> 5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
> 5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
> 5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M
> 

[...]

> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
> +				struct xdp_frame **frames, int n,
> +				struct net_device *dev)
> +{
> +	struct xdp_txq_info txq = { .dev = dev };
> +	struct xdp_buff xdp;
> +	int i, nframes = 0;
> +
> +	for (i = 0; i < n; i++) {
> +		struct xdp_frame *xdpf = frames[i];
> +		u32 act;
> +		int err;
> +
> +		xdp_convert_frame_to_buff(xdpf, &xdp);
> +		xdp.txq = &txq;
> +
> +		act = bpf_prog_run_xdp(xdp_prog, &xdp);
> +		switch (act) {
> +		case XDP_PASS:
> +			err = xdp_update_frame_from_buff(&xdp, xdpf);
> +			if (unlikely(err < 0))
> +				xdp_return_frame_rx_napi(xdpf);
> +			else
> +				frames[nframes++] = xdpf;
> +			break;
> +		default:
> +			bpf_warn_invalid_xdp_action(act);
> +			fallthrough;
> +		case XDP_ABORTED:
> +			trace_xdp_exception(dev, xdp_prog, act);
> +			fallthrough;
> +		case XDP_DROP:
> +			xdp_return_frame_rx_napi(xdpf);
> +			break;
> +		}
> +	}
> +	return nframes; /* sent frames count */
> +}
> +
>  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>  {
>  	struct net_device *dev = bq->dev;
> -	int sent = 0, drops = 0, err = 0;
> +	unsigned int cnt = bq->count;
> +	int drops = 0, err = 0;
> +	int to_send = cnt;
> +	int sent = cnt;
>  	int i;
>  
> -	if (unlikely(!bq->count))
> +	if (unlikely(!cnt))
>  		return;
>  
> -	for (i = 0; i < bq->count; i++) {
> +	for (i = 0; i < cnt; i++) {
>  		struct xdp_frame *xdpf = bq->q[i];
>  
>  		prefetch(xdpf);
>  	}
>  
> -	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
> +	if (bq->xdp_prog) {
> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
> +		if (!to_send) {
> +			sent = 0;
> +			goto out;
> +		}
> +		drops = cnt - to_send;
> +	}

I might be missing something about how *bq works here. What happens when
dev_map_bpf_prog_run returns to_send < cnt?

So I read this as it will send [0, to_send] and [to_send, cnt] will be
dropped? How do we know the bpf prog would have dropped the set,
[to_send+1, cnt]?

> +
> +	sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
>  	if (sent < 0) {
>  		err = sent;
>  		sent = 0;
> -		goto error;
> +
> +		/* If ndo_xdp_xmit fails with an errno, no frames have been
> +		 * xmit'ed and it's our responsibility to them free all.
> +		 */
> +		for (i = 0; i < cnt - drops; i++) {
> +			struct xdp_frame *xdpf = bq->q[i];
> +
> +			xdp_return_frame_rx_napi(xdpf);
> +		}
>  	}
> -	drops = bq->count - sent;
>  out:
> +	drops = cnt - sent;
>  	bq->count = 0;
>  
>  	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);

I gather the remaining [to_send+1, cnt] packets are in fact dropped
because we set bq->count=0 and trace_xdp_devmap_xmit seems to think
they are dropped.

Also update the comment in trace_xdp_devmap_xmit,

   /* Catch API error of drv ndo_xdp_xmit sent more than count */

so that it reads to account for devmap progs as well?

> -	bq->dev_rx = NULL;
> -	__list_del_clearprev(&bq->flush_node);
>  	return;
> -error:
> -	/* If ndo_xdp_xmit fails with an errno, no frames have been
> -	 * xmit'ed and it's our responsibility to them free all.
> -	 */
> -	for (i = 0; i < bq->count; i++) {
> -		struct xdp_frame *xdpf = bq->q[i];
> -
> -		xdp_return_frame_rx_napi(xdpf);
> -		drops++;
> -	}
> -	goto out;
>  }

[...]
  
> -static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
> -					 struct xdp_buff *xdp,
> -					 struct bpf_prog *xdp_prog)
> -{
> -	struct xdp_txq_info txq = { .dev = dev };
> -	u32 act;
> -
> -	xdp_set_data_meta_invalid(xdp);
> -	xdp->txq = &txq;
> -
> -	act = bpf_prog_run_xdp(xdp_prog, xdp);
> -	switch (act) {
> -	case XDP_PASS:
> -		return xdp;
> -	case XDP_DROP:
> -		break;
> -	default:
> -		bpf_warn_invalid_xdp_action(act);
> -		fallthrough;
> -	case XDP_ABORTED:
> -		trace_xdp_exception(dev, xdp_prog, act);
> -		break;
> -	}
> -
> -	xdp_return_buff(xdp);
> -	return NULL;
> -}
> -
>  int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
>  		    struct net_device *dev_rx)
>  {
> -	return __xdp_enqueue(dev, xdp, dev_rx);
> +	return __xdp_enqueue(dev, xdp, dev_rx, NULL);
>  }
>  
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>  {
>  	struct net_device *dev = dst->dev;
>  
> -	if (dst->xdp_prog) {
> -		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
> -		if (!xdp)
> -			return 0;

So here it looks like dev_map_run_prog will not drop extra
packets, but will see every single packet.

Are we changing the semantics subtly here? This looks like
a problem to me. We should not drop packets in the new case
unless the bpf program tells us to.

> -	}
> -	return __xdp_enqueue(dev, xdp, dev_rx);
> +	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
>  }

Did I miss something? If not maybe a comment in the commit
message would help or in the code itself.

Thanks,
John
Toke Høiland-Jørgensen Jan. 27, 2021, 9:41 a.m. UTC | #2
John Fastabend <john.fastabend@gmail.com> writes:

> Hangbin Liu wrote:
>> From: Jesper Dangaard Brouer <brouer@redhat.com>
>> 
>> This changes the devmap XDP program support to run the program when the
>> bulk queue is flushed instead of before the frame is enqueued. This has
>> a couple of benefits:
>> 
>> - It "sorts" the packets by destination devmap entry, and then runs the
>>   same BPF program on all the packets in sequence. This ensures that we
>>   keep the XDP program and destination device properties hot in I-cache.
>> 
>> - It makes the multicast implementation simpler because it can just
>>   enqueue packets using bq_enqueue() without having to deal with the
>>   devmap program at all.
>> 
>> The drawback is that if the devmap program drops the packet, the enqueue
>> step is redundant. However, arguably this is mostly visible in a
>> micro-benchmark, and with more mixed traffic the I-cache benefit should
>> win out. The performance impact of just this patch is as follows:
>> 
>> The bq_xmit_all's logic is also refactored and error label is removed.
>> When bq_xmit_all() is called from bq_enqueue(), another packet will
>> always be enqueued immediately after, so clearing dev_rx, xdp_prog and
>> flush_node in bq_xmit_all() is redundant. Let's move the clear to
>> __dev_flush(), and only check them once in bq_enqueue() since they are
>> all modified together.
>> 
>> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd:
>> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
>> 
>> There are about +/- 0.1M deviation for native testing, the performance
>> improved for the base-case, but some drop back with xdp devmap prog attached.
>> 
>> Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
>> 5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
>> 5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
>> 5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
>> 5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M
>> 
>
> [...]
>
>> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>> +				struct xdp_frame **frames, int n,
>> +				struct net_device *dev)
>> +{
>> +	struct xdp_txq_info txq = { .dev = dev };
>> +	struct xdp_buff xdp;
>> +	int i, nframes = 0;
>> +
>> +	for (i = 0; i < n; i++) {
>> +		struct xdp_frame *xdpf = frames[i];
>> +		u32 act;
>> +		int err;
>> +
>> +		xdp_convert_frame_to_buff(xdpf, &xdp);
>> +		xdp.txq = &txq;
>> +
>> +		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> +		switch (act) {
>> +		case XDP_PASS:
>> +			err = xdp_update_frame_from_buff(&xdp, xdpf);
>> +			if (unlikely(err < 0))
>> +				xdp_return_frame_rx_napi(xdpf);
>> +			else
>> +				frames[nframes++] = xdpf;
>> +			break;
>> +		default:
>> +			bpf_warn_invalid_xdp_action(act);
>> +			fallthrough;
>> +		case XDP_ABORTED:
>> +			trace_xdp_exception(dev, xdp_prog, act);
>> +			fallthrough;
>> +		case XDP_DROP:
>> +			xdp_return_frame_rx_napi(xdpf);
>> +			break;
>> +		}
>> +	}
>> +	return nframes; /* sent frames count */
>> +}
>> +
>>  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>>  {
>>  	struct net_device *dev = bq->dev;
>> -	int sent = 0, drops = 0, err = 0;
>> +	unsigned int cnt = bq->count;
>> +	int drops = 0, err = 0;
>> +	int to_send = cnt;
>> +	int sent = cnt;
>>  	int i;
>>  
>> -	if (unlikely(!bq->count))
>> +	if (unlikely(!cnt))
>>  		return;
>>  
>> -	for (i = 0; i < bq->count; i++) {
>> +	for (i = 0; i < cnt; i++) {
>>  		struct xdp_frame *xdpf = bq->q[i];
>>  
>>  		prefetch(xdpf);
>>  	}
>>  
>> -	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
>> +	if (bq->xdp_prog) {
>> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
>> +		if (!to_send) {
>> +			sent = 0;
>> +			goto out;
>> +		}
>> +		drops = cnt - to_send;
>> +	}
>
> I might be missing something about how *bq works here. What happens when
> dev_map_bpf_prog_run returns to_send < cnt?
>
> So I read this as it will send [0, to_send] and [to_send, cnt] will be
> dropped? How do we know the bpf prog would have dropped the set,
> [to_send+1, cnt]?

Because dev_map_bpf_prog_run() compacts the array:

+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (unlikely(err < 0))
+				xdp_return_frame_rx_napi(xdpf);
+			else
+				frames[nframes++] = xdpf;
+			break;

[...]

>>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>> @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>>  {
>>  	struct net_device *dev = dst->dev;
>>  
>> -	if (dst->xdp_prog) {
>> -		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
>> -		if (!xdp)
>> -			return 0;
>
> So here it looks like dev_map_run_prog will not drop extra
> packets, but will see every single packet.
>
> Are we changing the semantics subtle here? This looks like
> a problem to me. We should not drop packets in the new case
> unless bpf program tells us to.

It's not a change in semantics (see above), but I'll grant you that it's
subtle :)

-Toke
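
To make the compaction concrete, here is a small self-contained sketch (plain userspace C, not kernel code) of the same pattern: passed entries are written back to the front of the array, so [0, to_send) is exactly the set of frames the program passed, and drops = cnt - to_send covers the rest.

#include <stdio.h>

/* Stand-in for the per-frame verdict of the devmap program. */
static int prog_passes(int frame_id)
{
	return frame_id % 3 != 0;	/* "drop" every third frame */
}

/* Same shape as dev_map_bpf_prog_run(): compact passed entries to the front. */
static int run_prog_on_bulk(int *frames, int cnt)
{
	int i, nframes = 0;

	for (i = 0; i < cnt; i++) {
		if (prog_passes(frames[i]))
			frames[nframes++] = frames[i];
		/* dropped entries would be freed here in the kernel version */
	}
	return nframes;	/* becomes to_send in bq_xmit_all() */
}

int main(void)
{
	int q[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
	int cnt = 8, to_send;

	to_send = run_prog_on_bulk(q, cnt);
	/* q[0..to_send) now holds only the passed frames, in original order,
	 * so ndo_xdp_xmit() can be handed (q, to_send) directly.
	 */
	printf("to_send=%d drops=%d\n", to_send, cnt - to_send);
	return 0;
}
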
Fijalkowski, Maciej Jan. 27, 2021, 12:20 p.m. UTC | #3
On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote:
> John Fastabend <john.fastabend@gmail.com> writes:
> 
> > Hangbin Liu wrote:
> >> From: Jesper Dangaard Brouer <brouer@redhat.com>
> >> 
> >> This changes the devmap XDP program support to run the program when the
> >> bulk queue is flushed instead of before the frame is enqueued. This has
> >> a couple of benefits:
> >> 
> >> - It "sorts" the packets by destination devmap entry, and then runs the
> >>   same BPF program on all the packets in sequence. This ensures that we
> >>   keep the XDP program and destination device properties hot in I-cache.
> >> 
> >> - It makes the multicast implementation simpler because it can just
> >>   enqueue packets using bq_enqueue() without having to deal with the
> >>   devmap program at all.
> >> 
> >> The drawback is that if the devmap program drops the packet, the enqueue
> >> step is redundant. However, arguably this is mostly visible in a
> >> micro-benchmark, and with more mixed traffic the I-cache benefit should
> >> win out. The performance impact of just this patch is as follows:
> >> 
> >> The bq_xmit_all's logic is also refactored and error label is removed.
> >> When bq_xmit_all() is called from bq_enqueue(), another packet will
> >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and
> >> flush_node in bq_xmit_all() is redundant. Let's move the clear to
> >> __dev_flush(), and only check them once in bq_enqueue() since they are
> >> all modified together.
> >> 
> >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd:
> >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
> >> 
> >> There are about +/- 0.1M deviation for native testing, the performance
> >> improved for the base-case, but some drop back with xdp devmap prog attached.
> >> 
> >> Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
> >> 5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
> >> 5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
> >> 5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
> >> 5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M
> >> 
> >
> > [...]
> >
> >> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
> >> +				struct xdp_frame **frames, int n,
> >> +				struct net_device *dev)
> >> +{
> >> +	struct xdp_txq_info txq = { .dev = dev };
> >> +	struct xdp_buff xdp;
> >> +	int i, nframes = 0;
> >> +
> >> +	for (i = 0; i < n; i++) {
> >> +		struct xdp_frame *xdpf = frames[i];
> >> +		u32 act;
> >> +		int err;
> >> +
> >> +		xdp_convert_frame_to_buff(xdpf, &xdp);
> >> +		xdp.txq = &txq;
> >> +
> >> +		act = bpf_prog_run_xdp(xdp_prog, &xdp);
> >> +		switch (act) {
> >> +		case XDP_PASS:
> >> +			err = xdp_update_frame_from_buff(&xdp, xdpf);
> >> +			if (unlikely(err < 0))
> >> +				xdp_return_frame_rx_napi(xdpf);
> >> +			else
> >> +				frames[nframes++] = xdpf;
> >> +			break;
> >> +		default:
> >> +			bpf_warn_invalid_xdp_action(act);
> >> +			fallthrough;
> >> +		case XDP_ABORTED:
> >> +			trace_xdp_exception(dev, xdp_prog, act);
> >> +			fallthrough;
> >> +		case XDP_DROP:
> >> +			xdp_return_frame_rx_napi(xdpf);
> >> +			break;
> >> +		}
> >> +	}
> >> +	return nframes; /* sent frames count */
> >> +}
> >> +
> >>  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
> >>  {
> >>  	struct net_device *dev = bq->dev;
> >> -	int sent = 0, drops = 0, err = 0;
> >> +	unsigned int cnt = bq->count;
> >> +	int drops = 0, err = 0;
> >> +	int to_send = cnt;
> >> +	int sent = cnt;
> >>  	int i;
> >>  
> >> -	if (unlikely(!bq->count))
> >> +	if (unlikely(!cnt))
> >>  		return;
> >>  
> >> -	for (i = 0; i < bq->count; i++) {
> >> +	for (i = 0; i < cnt; i++) {
> >>  		struct xdp_frame *xdpf = bq->q[i];
> >>  
> >>  		prefetch(xdpf);
> >>  	}
> >>  
> >> -	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
> >> +	if (bq->xdp_prog) {
> >> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
> >> +		if (!to_send) {
> >> +			sent = 0;
> >> +			goto out;
> >> +		}
> >> +		drops = cnt - to_send;
> >> +	}
> >
> > I might be missing something about how *bq works here. What happens when
> > dev_map_bpf_prog_run returns to_send < cnt?
> >
> > So I read this as it will send [0, to_send] and [to_send, cnt] will be
> > dropped? How do we know the bpf prog would have dropped the set,
> > [to_send+1, cnt]?

You know that via the recalculation of the 'drops' value after returning from
dev_map_bpf_prog_run(), which is later passed on to trace_xdp_devmap_xmit.

> 
> Because dev_map_bpf_prog_run() compacts the array:
> 
> +		case XDP_PASS:
> +			err = xdp_update_frame_from_buff(&xdp, xdpf);
> +			if (unlikely(err < 0))
> +				xdp_return_frame_rx_napi(xdpf);
> +			else
> +				frames[nframes++] = xdpf;
> +			break;

To expand on this a little, the 'frames' array is reused and 'nframes' above is
the value that is returned and stored in the 'to_send' variable.

> 
> [...]
> 
> >>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> >> @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> >>  {
> >>  	struct net_device *dev = dst->dev;
> >>  
> >> -	if (dst->xdp_prog) {
> >> -		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
> >> -		if (!xdp)
> >> -			return 0;
> >
> > So here it looks like dev_map_run_prog will not drop extra
> > packets, but will see every single packet.
> >
> > Are we changing the semantics subtle here? This looks like
> > a problem to me. We should not drop packets in the new case
> > unless bpf program tells us to.
> 
> It's not a change in semantics (see above), but I'll grant you that it's
> subtle :)

dev map xdp prog still sees all of the frames.

Maybe you were referring to the fact that for the XDP_PASS action the
xdp->xdpf conversion might fail?

I'm wondering if we could actually do a further optimization and avoid
xdpf/xdp juggling.

What if xdp_dev_bulk_queue would be storing the xdp_buffs instead of
xdp_frames ?

Then you hit bq_xmit_all() and, if a prog is present, it doesn't have to do the
dance we have right now. After that you walk through the xdp_buff array
and do the conversion so that an xdp_frame array can be passed to
ndo_xdp_xmit().

I had a bad sleep so maybe I'm talking nonsense over here, will take
another look in the evening though :)


> 
> -Toke
>
Jesper Dangaard Brouer Jan. 27, 2021, 3 p.m. UTC | #4
On Wed, 27 Jan 2021 13:20:50 +0100
Maciej Fijalkowski <maciej.fijalkowski@intel.com> wrote:

> On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote:
> > John Fastabend <john.fastabend@gmail.com> writes:
> >   
> > > Hangbin Liu wrote:  
> > >> From: Jesper Dangaard Brouer <brouer@redhat.com>
> > >> 
> > >> This changes the devmap XDP program support to run the program when the
> > >> bulk queue is flushed instead of before the frame is enqueued. This has
> > >> a couple of benefits:
> > >> 
> > >> - It "sorts" the packets by destination devmap entry, and then runs the
> > >>   same BPF program on all the packets in sequence. This ensures that we
> > >>   keep the XDP program and destination device properties hot in I-cache.
> > >> 
> > >> - It makes the multicast implementation simpler because it can just
> > >>   enqueue packets using bq_enqueue() without having to deal with the
> > >>   devmap program at all.
> > >> 
> > >> The drawback is that if the devmap program drops the packet, the enqueue
> > >> step is redundant. However, arguably this is mostly visible in a
> > >> micro-benchmark, and with more mixed traffic the I-cache benefit should
> > >> win out. The performance impact of just this patch is as follows:
> > >> 
> > >> The bq_xmit_all's logic is also refactored and error label is removed.
> > >> When bq_xmit_all() is called from bq_enqueue(), another packet will
> > >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and
> > >> flush_node in bq_xmit_all() is redundant. Let's move the clear to
> > >> __dev_flush(), and only check them once in bq_enqueue() since they are
> > >> all modified together.
> > >> 
> > >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd:
> > >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
> > >> 
> > >> There are about +/- 0.1M deviation for native testing, the performance
> > >> improved for the base-case, but some drop back with xdp devmap prog attached.
> > >> 
> > >> Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
> > >> 5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
> > >> 5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
> > >> 5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
> > >> 5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M
> > >>   
> > >
> > > [...]
> > >  
> > >> +static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
> > >> +				struct xdp_frame **frames, int n,
> > >> +				struct net_device *dev)
> > >> +{
> > >> +	struct xdp_txq_info txq = { .dev = dev };
> > >> +	struct xdp_buff xdp;
> > >> +	int i, nframes = 0;
> > >> +
> > >> +	for (i = 0; i < n; i++) {
> > >> +		struct xdp_frame *xdpf = frames[i];
> > >> +		u32 act;
> > >> +		int err;
> > >> +
> > >> +		xdp_convert_frame_to_buff(xdpf, &xdp);
> > >> +		xdp.txq = &txq;
> > >> +
> > >> +		act = bpf_prog_run_xdp(xdp_prog, &xdp);
> > >> +		switch (act) {
> > >> +		case XDP_PASS:
> > >> +			err = xdp_update_frame_from_buff(&xdp, xdpf);
> > >> +			if (unlikely(err < 0))
> > >> +				xdp_return_frame_rx_napi(xdpf);
> > >> +			else
> > >> +				frames[nframes++] = xdpf;
> > >> +			break;
> > >> +		default:
> > >> +			bpf_warn_invalid_xdp_action(act);
> > >> +			fallthrough;
> > >> +		case XDP_ABORTED:
> > >> +			trace_xdp_exception(dev, xdp_prog, act);
> > >> +			fallthrough;
> > >> +		case XDP_DROP:
> > >> +			xdp_return_frame_rx_napi(xdpf);
> > >> +			break;
> > >> +		}
> > >> +	}
> > >> +	return nframes; /* sent frames count */
> > >> +}
> > >> +
> > >>  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
> > >>  {
> > >>  	struct net_device *dev = bq->dev;
> > >> -	int sent = 0, drops = 0, err = 0;
> > >> +	unsigned int cnt = bq->count;
> > >> +	int drops = 0, err = 0;
> > >> +	int to_send = cnt;
> > >> +	int sent = cnt;
> > >>  	int i;
> > >>  
> > >> -	if (unlikely(!bq->count))
> > >> +	if (unlikely(!cnt))
> > >>  		return;
> > >>  
> > >> -	for (i = 0; i < bq->count; i++) {
> > >> +	for (i = 0; i < cnt; i++) {
> > >>  		struct xdp_frame *xdpf = bq->q[i];
> > >>  
> > >>  		prefetch(xdpf);
> > >>  	}
> > >>  
> > >> -	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
> > >> +	if (bq->xdp_prog) {
> > >> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
> > >> +		if (!to_send) {
> > >> +			sent = 0;
> > >> +			goto out;
> > >> +		}
> > >> +		drops = cnt - to_send;
> > >> +	}  
> > >
> > > I might be missing something about how *bq works here. What happens when
> > > dev_map_bpf_prog_run returns to_send < cnt?
> > >
> > > So I read this as it will send [0, to_send] and [to_send, cnt] will be
> > > dropped? How do we know the bpf prog would have dropped the set,
> > > [to_send+1, cnt]?  
> 
> You know that via recalculation of 'drops' value after you returned from
> dev_map_bpf_prog_run() which later on is provided onto trace_xdp_devmap_xmit.
> 
> > 
> > Because dev_map_bpf_prog_run() compacts the array:
> > 
> > +		case XDP_PASS:
> > +			err = xdp_update_frame_from_buff(&xdp, xdpf);
> > +			if (unlikely(err < 0))
> > +				xdp_return_frame_rx_napi(xdpf);
> > +			else
> > +				frames[nframes++] = xdpf;
> > +			break;  
> 
> To expand this a little, 'frames' array is reused and 'nframes' above is
> the value that is returned and we store it onto 'to_send' variable.
> 
> > 
> > [...]
> >   
> > >>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> > >> @@ -489,12 +516,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> > >>  {
> > >>  	struct net_device *dev = dst->dev;
> > >>  
> > >> -	if (dst->xdp_prog) {
> > >> -		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
> > >> -		if (!xdp)
> > >> -			return 0;  
> > >
> > > So here it looks like dev_map_run_prog will not drop extra
> > > packets, but will see every single packet.
> > >
> > > Are we changing the semantics subtle here? This looks like
> > > a problem to me. We should not drop packets in the new case
> > > unless bpf program tells us to.  
> > 
> > It's not a change in semantics (see above), but I'll grant you that it's
> > subtle :)  
> 
> dev map xdp prog still sees all of the frames.
> 
> Maybe you were referring to a fact that for XDP_PASS action you might fail
> with xdp->xdpf conversion?
> 
> I'm wondering if we could actually do a further optimization and avoid
> xdpf/xdp juggling.
> 
> What if xdp_dev_bulk_queue would be storing the xdp_buffs instead of
> xdp_frames ?

Not possible. Remember that struct xdp_buff is "allocated" on the call
stack.  Thus, you cannot store a pointer to the xdp_buffs in
xdp_dev_bulk_queue.

The xdp_frame also avoids allocation, by using memory placed at the top of
the data frame.  Thus, you can store a pointer to the xdp_frame, as it is
actually backed by real memory.

See[1] slide-11 ("Fundamental structs")
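
A rough sketch of that lifetime point (illustrative only; field handling abbreviated, the real logic is xdp_convert_buff_to_frame() in net/core/xdp.c):

/* Illustrative only -- the real conversion is xdp_convert_buff_to_frame()
 * in net/core/xdp.c. The point: the xdp_frame struct is carved out of the
 * packet's own headroom, so its pointer stays valid after the driver's
 * stack-allocated xdp_buff has gone away.
 */
static struct xdp_frame *sketch_convert_buff_to_frame(struct xdp_buff *xdp)
{
	struct xdp_frame *xdpf = xdp->data_hard_start;	/* top of the data frame */

	xdpf->data = xdp->data;
	xdpf->len = xdp->data_end - xdp->data;
	xdpf->headroom = xdp->data - xdp->data_hard_start - sizeof(*xdpf);

	return xdpf;	/* safe to keep in bq->q[]; &xdp (on the stack) is not */
}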

> Then you hit bq_xmit_all and if prog is present it doesn't have to do that
> dance like we have right now. After that you walk through xdp_buff array
> and do the conversion so that xdp_frame array will be passed do
> ndo_xdp_xmit.

If you want to optimize the performance of this, I suggest that we detect
whether we need to call xdp_update_frame_from_buff(&xdp, xdpf) after the 2nd
XDP prog has run.  In many cases the BPF prog doesn't move head/tail/metadata,
so that call becomes unnecessary.
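
A hypothetical sketch of that detection (not part of this series) for the XDP_PASS case in dev_map_bpf_prog_run():

		case XDP_PASS:
			/* Hypothetical fast path: head/tail/metadata untouched
			 * by the devmap prog, so skip xdp_update_frame_from_buff().
			 */
			if (likely(xdp.data == xdpf->data &&
				   xdp.data_end == xdpf->data + xdpf->len &&
				   xdp.data_meta == xdpf->data - xdpf->metasize)) {
				frames[nframes++] = xdpf;
				break;
			}
			err = xdp_update_frame_from_buff(&xdp, xdpf);
			if (unlikely(err < 0))
				xdp_return_frame_rx_napi(xdpf);
			else
				frames[nframes++] = xdpf;
			break;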

 
> I had a bad sleep so maybe I'm talking nonsense over here, will take
> another look in the evening though :)

:)

[1] https://people.netfilter.org/hawk/presentations/KernelRecipes2019/xdp-netstack-concert.pdf
John Fastabend Jan. 27, 2021, 3:52 p.m. UTC | #5
Jesper Dangaard Brouer wrote:
> On Wed, 27 Jan 2021 13:20:50 +0100
> Maciej Fijalkowski <maciej.fijalkowski@intel.com> wrote:
> 
> > On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote:
> > > John Fastabend <john.fastabend@gmail.com> writes:
> > >   
> > > > Hangbin Liu wrote:  
> > > >> From: Jesper Dangaard Brouer <brouer@redhat.com>
> > > >> 
> > > >> This changes the devmap XDP program support to run the program when the
> > > >> bulk queue is flushed instead of before the frame is enqueued. This has
> > > >> a couple of benefits:
> > > >> 
> > > >> - It "sorts" the packets by destination devmap entry, and then runs the
> > > >>   same BPF program on all the packets in sequence. This ensures that we
> > > >>   keep the XDP program and destination device properties hot in I-cache.
> > > >> 
> > > >> - It makes the multicast implementation simpler because it can just
> > > >>   enqueue packets using bq_enqueue() without having to deal with the
> > > >>   devmap program at all.
> > > >> 
> > > >> The drawback is that if the devmap program drops the packet, the enqueue
> > > >> step is redundant. However, arguably this is mostly visible in a
> > > >> micro-benchmark, and with more mixed traffic the I-cache benefit should
> > > >> win out. The performance impact of just this patch is as follows:
> > > >> 
> > > >> The bq_xmit_all's logic is also refactored and error label is removed.
> > > >> When bq_xmit_all() is called from bq_enqueue(), another packet will
> > > >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and
> > > >> flush_node in bq_xmit_all() is redundant. Let's move the clear to
> > > >> __dev_flush(), and only check them once in bq_enqueue() since they are
> > > >> all modified together.
> > > >> 
> > > >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd:
> > > >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
> > > >> 
> > > >> There are about +/- 0.1M deviation for native testing, the performance
> > > >> improved for the base-case, but some drop back with xdp devmap prog attached.
> > > >> 
> > > >> Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
> > > >> 5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
> > > >> 5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
> > > >> 5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
> > > >> 5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M
> > > >>   
> > > >
> > > > [...]

Acked-by: John Fastabend <john.fastabend@gmail.com>

> > > >>  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
> > > >>  {
> > > >>  	struct net_device *dev = bq->dev;
> > > >> -	int sent = 0, drops = 0, err = 0;
> > > >> +	unsigned int cnt = bq->count;
> > > >> +	int drops = 0, err = 0;
> > > >> +	int to_send = cnt;
> > > >> +	int sent = cnt;
> > > >>  	int i;
> > > >>  
> > > >> -	if (unlikely(!bq->count))
> > > >> +	if (unlikely(!cnt))
> > > >>  		return;
> > > >>  
> > > >> -	for (i = 0; i < bq->count; i++) {
> > > >> +	for (i = 0; i < cnt; i++) {
> > > >>  		struct xdp_frame *xdpf = bq->q[i];
> > > >>  
> > > >>  		prefetch(xdpf);
> > > >>  	}
> > > >>  
> > > >> -	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
> > > >> +	if (bq->xdp_prog) {
> > > >> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
> > > >> +		if (!to_send) {
> > > >> +			sent = 0;
> > > >> +			goto out;
> > > >> +		}
> > > >> +		drops = cnt - to_send;
> > > >> +	}  
> > > >
> > > > I might be missing something about how *bq works here. What happens when
> > > > dev_map_bpf_prog_run returns to_send < cnt?
> > > >
> > > > So I read this as it will send [0, to_send] and [to_send, cnt] will be
> > > > dropped? How do we know the bpf prog would have dropped the set,
> > > > [to_send+1, cnt]?  
> > 
> > You know that via recalculation of 'drops' value after you returned from
> > dev_map_bpf_prog_run() which later on is provided onto trace_xdp_devmap_xmit.
> > 
> > > 
> > > Because dev_map_bpf_prog_run() compacts the array:
> > > 
> > > +		case XDP_PASS:
> > > +			err = xdp_update_frame_from_buff(&xdp, xdpf);
> > > +			if (unlikely(err < 0))
> > > +				xdp_return_frame_rx_napi(xdpf);
> > > +			else
> > > +				frames[nframes++] = xdpf;
> > > +			break;  
> > 
> > To expand this a little, 'frames' array is reused and 'nframes' above is
> > the value that is returned and we store it onto 'to_send' variable.
> > 

In the morning with coffee looks good to me. Thanks Toke, Jesper.
Jesper Dangaard Brouer Jan. 27, 2021, 3:58 p.m. UTC | #6
On Mon, 25 Jan 2021 20:45:11 +0800
Hangbin Liu <liuhangbin@gmail.com> wrote:

> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd:
> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
> 
> There are about +/- 0.1M deviation for native testing, the performance
> improved for the base-case, but some drop back with xdp devmap prog attached.
> 
> Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
> 5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
> 5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
> 5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
> 5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M

I want to highlight the improvement 9.1M -> 9.5M.  This is the native
(i40e->i40e) case where there isn't any "2nd xdp_prog".

This means that when we introduced the "2nd xdp_prog", we lost a little
performance without noticing (death-by-a-1000-paper-cuts), for the
baseline case where this feature is not used/activated.

This patch regains that performance for our baseline.  That in itself
makes this patch worth it.
Toke Høiland-Jørgensen Jan. 27, 2021, 4:05 p.m. UTC | #7
John Fastabend <john.fastabend@gmail.com> writes:

> Jesper Dangaard Brouer wrote:
>> On Wed, 27 Jan 2021 13:20:50 +0100
>> Maciej Fijalkowski <maciej.fijalkowski@intel.com> wrote:
>> 
>> > On Wed, Jan 27, 2021 at 10:41:44AM +0100, Toke Høiland-Jørgensen wrote:
>> > > John Fastabend <john.fastabend@gmail.com> writes:
>> > >   
>> > > > Hangbin Liu wrote:  
>> > > >> From: Jesper Dangaard Brouer <brouer@redhat.com>
>> > > >> 
>> > > >> This changes the devmap XDP program support to run the program when the
>> > > >> bulk queue is flushed instead of before the frame is enqueued. This has
>> > > >> a couple of benefits:
>> > > >> 
>> > > >> - It "sorts" the packets by destination devmap entry, and then runs the
>> > > >>   same BPF program on all the packets in sequence. This ensures that we
>> > > >>   keep the XDP program and destination device properties hot in I-cache.
>> > > >> 
>> > > >> - It makes the multicast implementation simpler because it can just
>> > > >>   enqueue packets using bq_enqueue() without having to deal with the
>> > > >>   devmap program at all.
>> > > >> 
>> > > >> The drawback is that if the devmap program drops the packet, the enqueue
>> > > >> step is redundant. However, arguably this is mostly visible in a
>> > > >> micro-benchmark, and with more mixed traffic the I-cache benefit should
>> > > >> win out. The performance impact of just this patch is as follows:
>> > > >> 
>> > > >> The bq_xmit_all's logic is also refactored and error label is removed.
>> > > >> When bq_xmit_all() is called from bq_enqueue(), another packet will
>> > > >> always be enqueued immediately after, so clearing dev_rx, xdp_prog and
>> > > >> flush_node in bq_xmit_all() is redundant. Let's move the clear to
>> > > >> __dev_flush(), and only check them once in bq_enqueue() since they are
>> > > >> all modified together.
>> > > >> 
>> > > >> By using xdp_redirect_map in sample/bpf and send pkts via pktgen cmd:
>> > > >> ./pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -t 10 -s 64
>> > > >> 
>> > > >> There are about +/- 0.1M deviation for native testing, the performance
>> > > >> improved for the base-case, but some drop back with xdp devmap prog attached.
>> > > >> 
>> > > >> Version          | Test                           | Generic | Native | Native + 2nd xdp_prog
>> > > >> 5.10 rc6         | xdp_redirect_map   i40e->i40e  |    2.0M |   9.1M |  8.0M
>> > > >> 5.10 rc6         | xdp_redirect_map   i40e->veth  |    1.7M |  11.0M |  9.7M
>> > > >> 5.10 rc6 + patch | xdp_redirect_map   i40e->i40e  |    2.0M |   9.5M |  7.5M
>> > > >> 5.10 rc6 + patch | xdp_redirect_map   i40e->veth  |    1.7M |  11.6M |  9.1M
>> > > >>   
>> > > >
>> > > > [...]
>
> Acked-by: John Fastabend <john.fastabend@gmail.com>
>
>> > > >>  static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>> > > >>  {
>> > > >>  	struct net_device *dev = bq->dev;
>> > > >> -	int sent = 0, drops = 0, err = 0;
>> > > >> +	unsigned int cnt = bq->count;
>> > > >> +	int drops = 0, err = 0;
>> > > >> +	int to_send = cnt;
>> > > >> +	int sent = cnt;
>> > > >>  	int i;
>> > > >>  
>> > > >> -	if (unlikely(!bq->count))
>> > > >> +	if (unlikely(!cnt))
>> > > >>  		return;
>> > > >>  
>> > > >> -	for (i = 0; i < bq->count; i++) {
>> > > >> +	for (i = 0; i < cnt; i++) {
>> > > >>  		struct xdp_frame *xdpf = bq->q[i];
>> > > >>  
>> > > >>  		prefetch(xdpf);
>> > > >>  	}
>> > > >>  
>> > > >> -	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
>> > > >> +	if (bq->xdp_prog) {
>> > > >> +		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
>> > > >> +		if (!to_send) {
>> > > >> +			sent = 0;
>> > > >> +			goto out;
>> > > >> +		}
>> > > >> +		drops = cnt - to_send;
>> > > >> +	}  
>> > > >
>> > > > I might be missing something about how *bq works here. What happens when
>> > > > dev_map_bpf_prog_run returns to_send < cnt?
>> > > >
>> > > > So I read this as it will send [0, to_send] and [to_send, cnt] will be
>> > > > dropped? How do we know the bpf prog would have dropped the set,
>> > > > [to_send+1, cnt]?  
>> > 
>> > You know that via recalculation of 'drops' value after you returned from
>> > dev_map_bpf_prog_run() which later on is provided onto trace_xdp_devmap_xmit.
>> > 
>> > > 
>> > > Because dev_map_bpf_prog_run() compacts the array:
>> > > 
>> > > +		case XDP_PASS:
>> > > +			err = xdp_update_frame_from_buff(&xdp, xdpf);
>> > > +			if (unlikely(err < 0))
>> > > +				xdp_return_frame_rx_napi(xdpf);
>> > > +			else
>> > > +				frames[nframes++] = xdpf;
>> > > +			break;  
>> > 
>> > To expand this a little, 'frames' array is reused and 'nframes' above is
>> > the value that is returned and we store it onto 'to_send' variable.
>> > 
>
> In the morning with coffee looks good to me. Thanks Toke, Jesper.

Haha, yeah, coffee does tend to help, doesn't it? You're welcome :)

-Toke

Patch

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f6e9c68afdd4..bf8b6b5c9cab 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -57,6 +57,7 @@  struct xdp_dev_bulk_queue {
 	struct list_head flush_node;
 	struct net_device *dev;
 	struct net_device *dev_rx;
+	struct bpf_prog *xdp_prog;
 	unsigned int count;
 };
 
@@ -327,46 +328,92 @@  bool dev_map_can_have_prog(struct bpf_map *map)
 	return false;
 }
 
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+				struct xdp_frame **frames, int n,
+				struct net_device *dev)
+{
+	struct xdp_txq_info txq = { .dev = dev };
+	struct xdp_buff xdp;
+	int i, nframes = 0;
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		u32 act;
+		int err;
+
+		xdp_convert_frame_to_buff(xdpf, &xdp);
+		xdp.txq = &txq;
+
+		act = bpf_prog_run_xdp(xdp_prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			err = xdp_update_frame_from_buff(&xdp, xdpf);
+			if (unlikely(err < 0))
+				xdp_return_frame_rx_napi(xdpf);
+			else
+				frames[nframes++] = xdpf;
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			fallthrough;
+		case XDP_ABORTED:
+			trace_xdp_exception(dev, xdp_prog, act);
+			fallthrough;
+		case XDP_DROP:
+			xdp_return_frame_rx_napi(xdpf);
+			break;
+		}
+	}
+	return nframes; /* sent frames count */
+}
+
 static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
 {
 	struct net_device *dev = bq->dev;
-	int sent = 0, drops = 0, err = 0;
+	unsigned int cnt = bq->count;
+	int drops = 0, err = 0;
+	int to_send = cnt;
+	int sent = cnt;
 	int i;
 
-	if (unlikely(!bq->count))
+	if (unlikely(!cnt))
 		return;
 
-	for (i = 0; i < bq->count; i++) {
+	for (i = 0; i < cnt; i++) {
 		struct xdp_frame *xdpf = bq->q[i];
 
 		prefetch(xdpf);
 	}
 
-	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+	if (bq->xdp_prog) {
+		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+		if (!to_send) {
+			sent = 0;
+			goto out;
+		}
+		drops = cnt - to_send;
+	}
+
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
 	if (sent < 0) {
 		err = sent;
 		sent = 0;
-		goto error;
+
+		/* If ndo_xdp_xmit fails with an errno, no frames have been
+		 * xmit'ed and it's our responsibility to them free all.
+		 */
+		for (i = 0; i < cnt - drops; i++) {
+			struct xdp_frame *xdpf = bq->q[i];
+
+			xdp_return_frame_rx_napi(xdpf);
+		}
 	}
-	drops = bq->count - sent;
 out:
+	drops = cnt - sent;
 	bq->count = 0;
 
 	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
-	bq->dev_rx = NULL;
-	__list_del_clearprev(&bq->flush_node);
 	return;
-error:
-	/* If ndo_xdp_xmit fails with an errno, no frames have been
-	 * xmit'ed and it's our responsibility to them free all.
-	 */
-	for (i = 0; i < bq->count; i++) {
-		struct xdp_frame *xdpf = bq->q[i];
-
-		xdp_return_frame_rx_napi(xdpf);
-		drops++;
-	}
-	goto out;
 }
 
 /* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -384,8 +431,12 @@  void __dev_flush(void)
 	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 	struct xdp_dev_bulk_queue *bq, *tmp;
 
-	list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
 		bq_xmit_all(bq, XDP_XMIT_FLUSH);
+		bq->dev_rx = NULL;
+		bq->xdp_prog = NULL;
+		__list_del_clearprev(&bq->flush_node);
+	}
 }
 
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -408,7 +459,7 @@  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
  * Thus, safe percpu variable access.
  */
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
-		       struct net_device *dev_rx)
+		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
 {
 	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
 	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -419,18 +470,22 @@  static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 	/* Ingress dev_rx will be the same for all xdp_frame's in
 	 * bulk_queue, because bq stored per-CPU and must be flushed
 	 * from net_device drivers NAPI func end.
+	 *
+	 * Do the same with xdp_prog and flush_list since these fields
+	 * are only ever modified together.
 	 */
-	if (!bq->dev_rx)
+	if (!bq->dev_rx) {
 		bq->dev_rx = dev_rx;
+		bq->xdp_prog = xdp_prog;
+		list_add(&bq->flush_node, flush_list);
+	}
 
 	bq->q[bq->count++] = xdpf;
-
-	if (!bq->flush_node.prev)
-		list_add(&bq->flush_node, flush_list);
 }
 
 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
-			       struct net_device *dev_rx)
+				struct net_device *dev_rx,
+				struct bpf_prog *xdp_prog)
 {
 	struct xdp_frame *xdpf;
 	int err;
@@ -446,42 +501,14 @@  static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
-	bq_enqueue(dev, xdpf, dev_rx);
+	bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
 	return 0;
 }
 
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
-					 struct xdp_buff *xdp,
-					 struct bpf_prog *xdp_prog)
-{
-	struct xdp_txq_info txq = { .dev = dev };
-	u32 act;
-
-	xdp_set_data_meta_invalid(xdp);
-	xdp->txq = &txq;
-
-	act = bpf_prog_run_xdp(xdp_prog, xdp);
-	switch (act) {
-	case XDP_PASS:
-		return xdp;
-	case XDP_DROP:
-		break;
-	default:
-		bpf_warn_invalid_xdp_action(act);
-		fallthrough;
-	case XDP_ABORTED:
-		trace_xdp_exception(dev, xdp_prog, act);
-		break;
-	}
-
-	xdp_return_buff(xdp);
-	return NULL;
-}
-
 int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	return __xdp_enqueue(dev, xdp, dev_rx);
+	return __xdp_enqueue(dev, xdp, dev_rx, NULL);
 }
 
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
@@ -489,12 +516,7 @@  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
 {
 	struct net_device *dev = dst->dev;
 
-	if (dst->xdp_prog) {
-		xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
-		if (!xdp)
-			return 0;
-	}
-	return __xdp_enqueue(dev, xdp, dev_rx);
+	return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
 }
 
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
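
For completeness, below is a minimal sketch of the BPF side: a hypothetical per-entry devmap ("egress") program of the kind that, after this patch, is run once per frame from bq_xmit_all() at flush time. The program and section names are illustrative; section naming conventions differ across libbpf versions, which is why the loader sketch earlier sets BPF_XDP_DEVMAP explicitly.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical devmap ("egress") program -- illustrative, not from this series. */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>

SEC("xdp_devmap/egress")	/* section naming varies across libbpf versions */
int xdp_devmap_prog(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* ctx->egress_ifindex identifies the destination device. Frames
	 * dropped here are freed in dev_map_bpf_prog_run(); frames passed
	 * are compacted into bq->q[] and handed to ndo_xdp_xmit().
	 */
	if (data + sizeof(struct ethhdr) > data_end)
		return XDP_DROP;

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";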