
[RFC,3/3] ublk_drv: add ebpf support

Message ID 20230215004122.28917-4-xiaoguang.wang@linux.alibaba.com (mailing list archive)
State New, archived
Series Add io_uring & ebpf based methods to implement zero-copy for ublk

Commit Message

Xiaoguang Wang Feb. 15, 2023, 12:41 a.m. UTC
Currently only one ebpf helper, bpf_ublk_queue_sqe(), is added; a ublksrv
target can use this helper to write an ebpf prog to support ublk kernel &
userspace zero copy. Please see the ublksrv test code for more info.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 drivers/block/ublk_drv.c       | 207 ++++++++++++++++++++++++++++++++-
 include/uapi/linux/bpf.h       |   1 +
 include/uapi/linux/ublk_cmd.h  |  11 ++
 scripts/bpf_doc.py             |   4 +
 tools/include/uapi/linux/bpf.h |   8 ++
 5 files changed, 229 insertions(+), 2 deletions(-)

Comments

Ming Lei Feb. 16, 2023, 8:11 a.m. UTC | #1
On Wed, Feb 15, 2023 at 08:41:22AM +0800, Xiaoguang Wang wrote:
> Currenly only one bpf_ublk_queue_sqe() ebpf is added, ublksrv target
> can use this helper to write ebpf prog to support ublk kernel & usersapce
> zero copy, please see ublksrv test codes for more info.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
> ---
>  drivers/block/ublk_drv.c       | 207 ++++++++++++++++++++++++++++++++-
>  include/uapi/linux/bpf.h       |   1 +
>  include/uapi/linux/ublk_cmd.h  |  11 ++
>  scripts/bpf_doc.py             |   4 +
>  tools/include/uapi/linux/bpf.h |   8 ++
>  5 files changed, 229 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index b628e9eaefa6..44c289b72864 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -61,6 +61,7 @@
>  struct ublk_rq_data {
>  	struct llist_node node;
>  	struct callback_head work;
> +	struct io_mapped_kbuf *kbuf;
>  };
>  
>  struct ublk_uring_cmd_pdu {
> @@ -163,6 +164,9 @@ struct ublk_device {
>  	unsigned int		nr_queues_ready;
>  	atomic_t		nr_aborted_queues;
>  
> +	struct bpf_prog		*io_prep_prog;
> +	struct bpf_prog		*io_submit_prog;
> +
>  	/*
>  	 * Our ubq->daemon may be killed without any notification, so
>  	 * monitor each queue's daemon periodically
> @@ -189,10 +193,46 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>  
>  static struct miscdevice ublk_misc;
>  
> +struct ublk_io_bpf_ctx {
> +	struct ublk_bpf_ctx ctx;
> +	struct ublk_device *ub;
> +	struct callback_head work;
> +};
> +
> +BPF_CALL_4(bpf_ublk_queue_sqe, struct ublk_io_bpf_ctx *, bpf_ctx,
> +	   struct io_uring_sqe *, sqe, u32, sqe_len, u32, fd)
> +{
> +	struct request *rq;
> +	struct ublk_rq_data *data;
> +	struct io_mapped_kbuf *kbuf;
> +	u16 q_id = bpf_ctx->ctx.q_id;
> +	u16 tag = bpf_ctx->ctx.tag;
> +
> +	rq = blk_mq_tag_to_rq(bpf_ctx->ub->tag_set.tags[q_id], tag);
> +	data = blk_mq_rq_to_pdu(rq);
> +	kbuf = data->kbuf;
> +	io_uring_submit_sqe(fd, sqe, sqe_len, kbuf);
> +	return 0;
> +}
> +
> +const struct bpf_func_proto ublk_bpf_queue_sqe_proto = {
> +	.func = bpf_ublk_queue_sqe,
> +	.gpl_only = false,
> +	.ret_type = RET_INTEGER,
> +	.arg1_type = ARG_ANYTHING,
> +	.arg2_type = ARG_ANYTHING,
> +	.arg3_type = ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  ublk_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  {
> -	return bpf_base_func_proto(func_id);
> +	switch (func_id) {
> +	case BPF_FUNC_ublk_queue_sqe:
> +		return &ublk_bpf_queue_sqe_proto;
> +	default:
> +		return bpf_base_func_proto(func_id);
> +	}
>  }
>  
>  static bool ublk_bpf_is_valid_access(int off, int size,
> @@ -200,6 +240,23 @@ static bool ublk_bpf_is_valid_access(int off, int size,
>  			const struct bpf_prog *prog,
>  			struct bpf_insn_access_aux *info)
>  {
> +	if (off < 0 || off >= sizeof(struct ublk_bpf_ctx))
> +		return false;
> +	if (off % size != 0)
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct ublk_bpf_ctx, q_id):
> +		return size == sizeof_field(struct ublk_bpf_ctx, q_id);
> +	case offsetof(struct ublk_bpf_ctx, tag):
> +		return size == sizeof_field(struct ublk_bpf_ctx, tag);
> +	case offsetof(struct ublk_bpf_ctx, op):
> +		return size == sizeof_field(struct ublk_bpf_ctx, op);
> +	case offsetof(struct ublk_bpf_ctx, nr_sectors):
> +		return size == sizeof_field(struct ublk_bpf_ctx, nr_sectors);
> +	case offsetof(struct ublk_bpf_ctx, start_sector):
> +		return size == sizeof_field(struct ublk_bpf_ctx, start_sector);
> +	}
>  	return false;
>  }
>  
> @@ -324,7 +381,7 @@ static void ublk_put_device(struct ublk_device *ub)
>  static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
>  		int qid)
>  {
> -       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
> +	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
>  }
>  
>  static inline bool ublk_rq_has_data(const struct request *rq)
> @@ -492,12 +549,16 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data,
>  static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
>  		struct ublk_io *io)
>  {
> +	struct ublk_device *ub = ubq->dev;
>  	const unsigned int rq_bytes = blk_rq_bytes(req);
>  	/*
>  	 * no zero copy, we delay copy WRITE request data into ublksrv
>  	 * context and the big benefit is that pinning pages in current
>  	 * context is pretty fast, see ublk_pin_user_pages
>  	 */
> +	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
> +		return rq_bytes;

Can you explain a bit why READ isn't supported? WRITE zero copy is
supposed to be easy to support with a splice-based approach, and I am
actually more interested in READ zc.

> +
>  	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
>  		return rq_bytes;
>  
> @@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
>  	}
>  }
>  
> +static void ublk_bpf_io_submit_fn(struct callback_head *work)
> +{
> +	struct ublk_io_bpf_ctx *bpf_ctx = container_of(work,
> +			struct ublk_io_bpf_ctx, work);
> +
> +	if (bpf_ctx->ub->io_submit_prog)
> +		bpf_prog_run_pin_on_cpu(bpf_ctx->ub->io_submit_prog, bpf_ctx);
> +	kfree(bpf_ctx);
> +}
> +
> +static int ublk_init_uring_kbuf(struct request *rq)
> +{
> +	struct bio_vec *bvec;
> +	struct req_iterator rq_iter;
> +	struct bio_vec tmp;
> +	int nr_bvec = 0;
> +	struct io_mapped_kbuf *kbuf;
> +	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
> +
> +	/* Drop previous allocation */
> +	if (data->kbuf) {
> +		kfree(data->kbuf->bvec);
> +		kfree(data->kbuf);
> +		data->kbuf = NULL;
> +	}
> +
> +	kbuf = kmalloc(sizeof(struct io_mapped_kbuf), GFP_NOIO);
> +	if (!kbuf)
> +		return -EIO;
> +
> +	rq_for_each_bvec(tmp, rq, rq_iter)
> +		nr_bvec++;
> +
> +	bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO);
> +	if (!bvec) {
> +		kfree(kbuf);
> +		return -EIO;
> +	}
> +	kbuf->bvec = bvec;
> +	rq_for_each_bvec(tmp, rq, rq_iter) {
> +		*bvec = tmp;
> +		bvec++;
> +	}
> +
> +	kbuf->count = blk_rq_bytes(rq);
> +	kbuf->nr_bvecs = nr_bvec;
> +	data->kbuf = kbuf;
> +	return 0;

bio/req bvec table is immutable, so here you can pass its reference
to kbuf directly.

> +}
> +
> +static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
> +{
> +	int err;
> +	struct ublk_device *ub = ubq->dev;
> +	struct bpf_prog *prog = ub->io_prep_prog;
> +	struct ublk_io_bpf_ctx *bpf_ctx;
> +
> +	if (!prog)
> +		return 0;
> +
> +	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
> +	if (!bpf_ctx)
> +		return -EIO;
> +
> +	err = ublk_init_uring_kbuf(rq);
> +	if (err < 0) {
> +		kfree(bpf_ctx);
> +		return -EIO;
> +	}
> +	bpf_ctx->ub = ub;
> +	bpf_ctx->ctx.q_id = ubq->q_id;
> +	bpf_ctx->ctx.tag = rq->tag;
> +	bpf_ctx->ctx.op = req_op(rq);
> +	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
> +	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);

The above sets up the target io parameters, which are supposed to come
from userspace, because they are the result of user space logic. If
these parameters come from the kernel, the whole logic has to be done
in io_prep_prog.

> +	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
> +
> +	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
> +	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
> +		kfree(bpf_ctx);

task_work_add() is only available when ublk is built in.

> +	return 0;
> +}
> +
>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		const struct blk_mq_queue_data *bd)
>  {
> @@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	if (unlikely(res != BLK_STS_OK))
>  		return BLK_STS_IOERR;
>  
> +	/* Currently just for test. */
> +	ublk_run_bpf_prog(ubq, rq);

Can you explain the above comment a bit? When is io_prep_prog called
in the non-test version? Or can you post the non-test version on the
list for review.

Here is the key to understanding the whole idea: when is io_prep_prog
finally called, and how are parameters passed to it?

Given it is an ebpf prog, I don't think any userspace parameter can be
passed to io_prep_prog when submitting IO, so does that mean all user
logic has to be done inside io_prep_prog? If yes, I am not sure it is a
good way, because an ebpf prog is a very limited programming
environment, but the user logic could be as complicated as using a
btree to map io, or communicating with a remote machine to figure out
the mapping. Loop is just the simplest direct mapping.


Thanks, 
Ming
Xiaoguang Wang Feb. 16, 2023, 12:12 p.m. UTC | #2
hello,

> On Wed, Feb 15, 2023 at 08:41:22AM +0800, Xiaoguang Wang wrote:
>> Currenly only one bpf_ublk_queue_sqe() ebpf is added, ublksrv target
>> can use this helper to write ebpf prog to support ublk kernel & usersapce
>> zero copy, please see ublksrv test codes for more info.
>>
>>  	 */
>> +	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
>> +		return rq_bytes;
> Can you explain a bit why READ isn't supported? Because WRITE zero
> copy is supposed to be supported easily with splice based approach,
> and I am more interested in READ zc actually.
No special reason, the READ op can also be supported. I'll
add this support in patch set v2.
For this RFC patch set, I just tried to show the idea, so
I must admit that the current code is not mature enough :)

>
>> +
>>  	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
>>  		return rq_bytes;
>>  
>> @@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
>>  	}
>>  }
>>  
>>
>> +	kbuf->bvec = bvec;
>> +	rq_for_each_bvec(tmp, rq, rq_iter) {
>> +		*bvec = tmp;
>> +		bvec++;
>> +	}
>> +
>> +	kbuf->count = blk_rq_bytes(rq);
>> +	kbuf->nr_bvecs = nr_bvec;
>> +	data->kbuf = kbuf;
>> +	return 0;
> bio/req bvec table is immutable, so here you can pass its reference
> to kbuf directly.
Yeah, thanks.

>
>> +}
>> +
>> +static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
>> +{
>> +	int err;
>> +	struct ublk_device *ub = ubq->dev;
>> +	struct bpf_prog *prog = ub->io_prep_prog;
>> +	struct ublk_io_bpf_ctx *bpf_ctx;
>> +
>> +	if (!prog)
>> +		return 0;
>> +
>> +	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
>> +	if (!bpf_ctx)
>> +		return -EIO;
>> +
>> +	err = ublk_init_uring_kbuf(rq);
>> +	if (err < 0) {
>> +		kfree(bpf_ctx);
>> +		return -EIO;
>> +	}
>> +	bpf_ctx->ub = ub;
>> +	bpf_ctx->ctx.q_id = ubq->q_id;
>> +	bpf_ctx->ctx.tag = rq->tag;
>> +	bpf_ctx->ctx.op = req_op(rq);
>> +	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
>> +	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);
> The above is for setting up target io parameter, which is supposed
> to be from userspace, cause it is result of user space logic. If
> these parameters are from kernel, the whole logic has to be done
> in io_prep_prog.
Yeah, it's designed so that io_prep_prog implements the user space
io logic.

>
>> +	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
>> +
>> +	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
>> +	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
>> +		kfree(bpf_ctx);
> task_work_add() is only available in case of ublk builtin.
Yeah, I'm thinking how to work around it.

>
>> +	return 0;
>> +}
>> +
>>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  		const struct blk_mq_queue_data *bd)
>>  {
>> @@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  	if (unlikely(res != BLK_STS_OK))
>>  		return BLK_STS_IOERR;
>>  
>> +	/* Currently just for test. */
>> +	ublk_run_bpf_prog(ubq, rq);
> Can you explain the above comment a bit? When is the io_prep_prog called
> in the non-test version? Or can you post the non-test version in list
> for review.
Forgot to delete stale comments, sorry. I'm writing the v2 patch set now.

> Here it is the key for understanding the whole idea, especially when
> is io_prep_prog called finally? How to pass parameters to io_prep_prog?
Let me explain more about the design:
io_prep_prog has two types of parameters:
1) its call argument: struct ublk_bpf_ctx, see ublk.bpf.c.
ublk_bpf_ctx describes one kernel io request: its op, qid and
sectors info. io_prep_prog uses this info to map the target io.
2) ebpf map structures: the user space daemon can use map
structures to pass additional information from user space to
io_prep_prog, which helps it initialize the target io if necessary.

io_prep_prog is called when ublk_queue_rq() is called. This bpf
prog initializes one or more sqes according to the user logic,
puts these sqes in an ebpf map structure, and then executes a
task_work_add() to notify the ubq_daemon to run io_submit_prog.
Note that we can not call io_uring_submit_sqe() in the task context
that calls ublk_queue_rq(), since that context does not own the
io_uring instance of the ubq_daemon.
Later the ubq_daemon calls io_submit_prog to submit the sqes.
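
[Editor's sketch] For illustration, a minimal sketch of what such a pair of
progs could look like for a direct 1:1 (loop-style) mapping. This is not the
actual ublksrv ublk.bpf.c: the SEC() names, the map layouts, the hand-rolled
helper declaration, and reading the helper's last argument as the daemon's
io_uring fd are all assumptions based only on this patch.

#include <linux/bpf.h>
#include <linux/io_uring.h>
#include <linux/ublk_cmd.h>
#include <bpf/bpf_helpers.h>

/* Helper added by this patch (FN 212); declared by hand here since a
 * regenerated bpf_helper_defs.h is assumed, not shown. */
static long (*bpf_ublk_queue_sqe)(void *ctx, struct io_uring_sqe *sqe,
				  __u32 sqe_len, __u32 fd) = (void *)212;

/* Filled by the ublksrv daemon: [0] = backing file fd, [1] = io_uring fd. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 2);
	__type(key, __u32);
	__type(value, __u32);
} fds SEC(".maps");

/* One staged sqe per tag; a real prog would also need per-queue keying. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 128);
	__type(key, __u32);
	__type(value, struct io_uring_sqe);
} staged_sqes SEC(".maps");

SEC("ublk.prep")			/* section names are assumptions */
int io_prep_prog(struct ublk_bpf_ctx *ctx)
{
	struct io_uring_sqe sqe = {};
	__u32 bfd_key = 0, tag = ctx->tag;
	__u32 *bfd = bpf_map_lookup_elem(&fds, &bfd_key);

	if (!bfd)
		return 0;

	/* Loop target: same offset in the backing file.  The data buffer
	 * is attached in the kernel from the request's kbuf, so addr is
	 * left unset here. */
	sqe.opcode = ctx->op == 1 /* REQ_OP_WRITE */ ?
			IORING_OP_WRITE : IORING_OP_READ;
	sqe.fd = *bfd;
	sqe.off = ctx->start_sector << 9;
	sqe.len = ctx->nr_sectors << 9;

	bpf_map_update_elem(&staged_sqes, &tag, &sqe, BPF_ANY);
	return 0;
}

SEC("ublk.submit")
int io_submit_prog(struct ublk_bpf_ctx *ctx)
{
	__u32 rfd_key = 1, tag = ctx->tag;
	__u32 *rfd = bpf_map_lookup_elem(&fds, &rfd_key);
	struct io_uring_sqe *sqe = bpf_map_lookup_elem(&staged_sqes, &tag);

	if (!rfd || !sqe)
		return 0;

	/* Runs in ubq_daemon context via task_work and submits the staged
	 * sqe on the daemon's ring. */
	bpf_ublk_queue_sqe(ctx, sqe, sizeof(*sqe), *rfd);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";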

>
> Given it is ebpf prog, I don't think any userspace parameter can be
> passed to io_prep_prog when submitting IO, that means all user logic has
> to be done inside io_prep_prog? If yes, not sure if it is one good way,
> cause ebpf prog is very limited programming environment, but the user
> logic could be as complicated as using btree to map io, or communicating
> with remote machine for figuring out the mapping. Loop is just the
> simplest direct mapping.
Currently, we can use ebpf map structures to pass user space
parameters to io_prep_prog. I also agree with you that complicated
logic may be hard to implement in an ebpf prog; hopefully the ebpf
community will improve this situation gradually.

The userspace target implementations I have met so far just use
userspace block device solutions to access a distributed filesystem;
they involve socket programming and have simple io mapping logic. We
can prepare the socket fd in an ebpf map structure, and that io
mapping logic should be easy to implement in an ebpf prog, though I
have not applied this ebpf method to our internal business yet.

Thanks for review.

Regards,
Xiaoguang Wang

>
>
> Thanks, 
> Ming
Ming Lei Feb. 17, 2023, 3:02 a.m. UTC | #3
On Thu, Feb 16, 2023 at 08:12:18PM +0800, Xiaoguang Wang wrote:
> hello,
> 
> > On Wed, Feb 15, 2023 at 08:41:22AM +0800, Xiaoguang Wang wrote:
> >> Currenly only one bpf_ublk_queue_sqe() ebpf is added, ublksrv target
> >> can use this helper to write ebpf prog to support ublk kernel & usersapce
> >> zero copy, please see ublksrv test codes for more info.
> >>
> >>  	 */
> >> +	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
> >> +		return rq_bytes;
> > Can you explain a bit why READ isn't supported? Because WRITE zero
> > copy is supposed to be supported easily with splice based approach,
> > and I am more interested in READ zc actually.
> No special reason, READ op can also be supported. I'll
> add this support in patch set v2.
> For this RFC patch set, I just tried to show the idea, so
> I must admit that current codes are not mature enough :)

OK.

> 
> >
> >> +
> >>  	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
> >>  		return rq_bytes;
> >>  
> >> @@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
> >>  	}
> >>  }
> >>  
> >>
> >> +	kbuf->bvec = bvec;
> >> +	rq_for_each_bvec(tmp, rq, rq_iter) {
> >> +		*bvec = tmp;
> >> +		bvec++;
> >> +	}
> >> +
> >> +	kbuf->count = blk_rq_bytes(rq);
> >> +	kbuf->nr_bvecs = nr_bvec;
> >> +	data->kbuf = kbuf;
> >> +	return 0;
> > bio/req bvec table is immutable, so here you can pass its reference
> > to kbuf directly.
> Yeah, thanks.

Also if this request has multiple bios, you either need to submit
multiple sqes or copy all the bvecs into a single table. And in the
case of a single bio, the table reference can be used directly.
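
[Editor's sketch] A minimal sketch of that single-bio fast path, assuming the
io_mapped_kbuf layout used in this patch (untested; the teardown path would
also have to learn not to kfree() the shared table):

/* Sketch only: reuse the request's immutable bvec table for the
 * single-bio case instead of allocating and copying a new one. */
static int ublk_init_uring_kbuf(struct request *rq)
{
	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
	struct io_mapped_kbuf *kbuf;
	struct bio *bio = rq->bio;

	/* Caller would fall back to the copying path for multi-bio requests. */
	if (!bio || bio->bi_next)
		return -EOPNOTSUPP;

	kbuf = kmalloc(sizeof(*kbuf), GFP_NOIO);
	if (!kbuf)
		return -ENOMEM;

	/* The bio's bvec table is immutable while the request is in flight,
	 * so point at it directly.  Note the iterator may start at a
	 * non-zero bvec/offset (bio->bi_iter), which a complete version
	 * would have to carry along as well. */
	kbuf->bvec = bio->bi_io_vec;
	kbuf->nr_bvecs = bio->bi_vcnt;
	kbuf->count = blk_rq_bytes(rq);
	data->kbuf = kbuf;
	return 0;
}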

> 
> >
> >> +}
> >> +
> >> +static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
> >> +{
> >> +	int err;
> >> +	struct ublk_device *ub = ubq->dev;
> >> +	struct bpf_prog *prog = ub->io_prep_prog;
> >> +	struct ublk_io_bpf_ctx *bpf_ctx;
> >> +
> >> +	if (!prog)
> >> +		return 0;
> >> +
> >> +	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
> >> +	if (!bpf_ctx)
> >> +		return -EIO;
> >> +
> >> +	err = ublk_init_uring_kbuf(rq);
> >> +	if (err < 0) {
> >> +		kfree(bpf_ctx);
> >> +		return -EIO;
> >> +	}
> >> +	bpf_ctx->ub = ub;
> >> +	bpf_ctx->ctx.q_id = ubq->q_id;
> >> +	bpf_ctx->ctx.tag = rq->tag;
> >> +	bpf_ctx->ctx.op = req_op(rq);
> >> +	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
> >> +	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);
> > The above is for setting up target io parameter, which is supposed
> > to be from userspace, cause it is result of user space logic. If
> > these parameters are from kernel, the whole logic has to be done
> > in io_prep_prog.
> Yeah, it's designed that io_prep_prog implements user space
> io logic.

That could be the biggest weakness of this approach: people really
want to implement complicated logic in userspace, which should be the
biggest value of ublk, but now it seems you move kernel C programming
into ebpf userspace programming, and I don't think ebpf is good at
handling complicated userspace logic.

> 
> >
> >> +	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
> >> +
> >> +	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
> >> +	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
> >> +		kfree(bpf_ctx);
> > task_work_add() is only available in case of ublk builtin.
> Yeah, I'm thinking how to work around it.
> 
> >
> >> +	return 0;
> >> +}
> >> +
> >>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>  		const struct blk_mq_queue_data *bd)
> >>  {
> >> @@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> >>  	if (unlikely(res != BLK_STS_OK))
> >>  		return BLK_STS_IOERR;
> >>  
> >> +	/* Currently just for test. */
> >> +	ublk_run_bpf_prog(ubq, rq);
> > Can you explain the above comment a bit? When is the io_prep_prog called
> > in the non-test version? Or can you post the non-test version in list
> > for review.
> Forgot to delete stale comments, sorry. I'm writing v2 patch set,

OK, got it, so it looks like ublk_run_bpf_prog is designed to run two
progs loaded from two control commands.

> 
> > Here it is the key for understanding the whole idea, especially when
> > is io_prep_prog called finally? How to pass parameters to io_prep_prog?
> Let me explain more about the design:
> io_prep_prog has two types of parameters:
> 1) its call argument: struct ublk_bpf_ctx, see ublk.bpf.c.
> ublk_bpf_ctx will describe one kernel io requests about
> its op, qid, sectors info. io_prep_prog uses these info to
> map target io.
> 2) ebpf map structure, user space daemon can use map
> structure to pass much information from user space to
> io_prep_prog, which will help it to initialize target io if necessary.
> 
> io_prep_prog is called when ublk_queue_rq() is called, this bpf
> prog will initialize one or more sqes according to user logic, and
> io_prep_prog will put these sqes in an ebpf map structure, then
> execute a task_work_add() to notify ubq_daemon to execute
> io_submit_prog. Note, we can not call io_uring_submit_sqe()
> in task context that calls ublk_queue_rq(), that context does not
> have io_uring instance owned by ubq_daemon.
> Later ubq_daemon will call io_submit_prog to submit sqes.

Submitting sqes from the kernel looks interesting, but I guess
performance may be hurt, given that plugging (batching) can't be
applied any more, which is supposed to affect io perf a lot.



Thanks,
Ming
Ming Lei Feb. 17, 2023, 10:46 a.m. UTC | #4
On Fri, Feb 17, 2023 at 11:02:14AM +0800, Ming Lei wrote:
> On Thu, Feb 16, 2023 at 08:12:18PM +0800, Xiaoguang Wang wrote:
> > hello,

...

> > io_prep_prog is called when ublk_queue_rq() is called, this bpf
> > prog will initialize one or more sqes according to user logic, and
> > io_prep_prog will put these sqes in an ebpf map structure, then
> > execute a task_work_add() to notify ubq_daemon to execute
> > io_submit_prog. Note, we can not call io_uring_submit_sqe()
> > in task context that calls ublk_queue_rq(), that context does not
> > have io_uring instance owned by ubq_daemon.
> > Later ubq_daemon will call io_submit_prog to submit sqes.
> 
> Submitting sqe from kernel looks interesting, but I guess
> performance may be hurt, given plugging(batching) can't be applied
> any more, which is supposed to affect io perf a lot.

If submitting SQEs in the kernel is really doable, maybe we can add
another command, such as UBLK_IO_SUBMIT_SQE (just like
UBLK_IO_NEED_GET_DATA), pass the built SQE (which represents part of
the user logic result) as io_uring command payload, and ask the ublk
driver to build the buffer for this SQE, then submit the SQE in the
kernel.

But there is an SQE ordering problem: net usually requires SQEs to be
linked and submitted in order, and with this approach it becomes hard
to maintain SQE order (some linked in userspace, and some in the
kernel).
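
[Editor's sketch] Purely to illustrate the idea floated here, one possible
shape of such a command payload. This is hypothetical: the command value,
struct name and fields are made up and appear nowhere in this series.

/* Hypothetical only: a UBLK_IO_SUBMIT_SQE io_uring command payload.
 * The driver would look up the ublk request by (q_id, tag), attach its
 * kernel buffer to the user-built SQE and submit it from the kernel. */
#define UBLK_IO_SUBMIT_SQE	0x23		/* value not allocated anywhere */

struct ublksrv_io_submit_sqe_cmd {
	__u16	q_id;
	__u16	tag;		/* ublk request supplying the buffer */
	__u32	sqe_len;	/* one or more linked SQEs */
	__u64	sqe_addr;	/* userspace address of the built SQE(s) */
};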

Thanks,
Ming
Xiaoguang Wang Feb. 22, 2023, 2:13 p.m. UTC | #5
hi,

I spent some time writing v2, and especially thinking about how to
work around task_work_add() not being exported, so sorry for the late
response.
>
>>> The above is for setting up target io parameter, which is supposed
>>> to be from userspace, cause it is result of user space logic. If
>>> these parameters are from kernel, the whole logic has to be done
>>> in io_prep_prog.
>> Yeah, it's designed that io_prep_prog implements user space
>> io logic.
> That could be the biggest weakness of this approach, because people
> really want to implement complicated logic in userspace, which should
> be the biggest value of ublk, but now seems you move kernel C
> programming into ebpf userspace programming, I don't think ebpf
> is good at handling complicated userspace logic.
Absolutely agree with you, ebpf has strict programming rules; I spent
more time than I had expected at the start to support the loop target
ebpf prog (ublk.bpf.c). Later I'll try to collaborate with my
colleagues to see whether we can program their userspace logic into an
ebpf prog, at least partially.
 
>> io_prep_prog is called when ublk_queue_rq() is called, this bpf
>> prog will initialize one or more sqes according to user logic, and
>> io_prep_prog will put these sqes in an ebpf map structure, then
>> execute a task_work_add() to notify ubq_daemon to execute
>> io_submit_prog. Note, we can not call io_uring_submit_sqe()
>> in task context that calls ublk_queue_rq(), that context does not
>> have io_uring instance owned by ubq_daemon.
>> Later ubq_daemon will call io_submit_prog to submit sqes.
> Submitting sqe from kernel looks interesting, but I guess
> performance may be hurt, given plugging(batching) can't be applied
> any more, which is supposed to affect io perf a lot.
Yes, agreed, but I haven't had much time to improve this yet.
Currently I mainly try to use this feature for large ios, to
reduce memory copy overhead, which consumes a lot of cpu
resources; our clients really hope we can reduce it.

Regards,
Xiaoguang Wang

>
>
>
> Thanks,
> Ming

Patch

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b628e9eaefa6..44c289b72864 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -61,6 +61,7 @@ 
 struct ublk_rq_data {
 	struct llist_node node;
 	struct callback_head work;
+	struct io_mapped_kbuf *kbuf;
 };
 
 struct ublk_uring_cmd_pdu {
@@ -163,6 +164,9 @@  struct ublk_device {
 	unsigned int		nr_queues_ready;
 	atomic_t		nr_aborted_queues;
 
+	struct bpf_prog		*io_prep_prog;
+	struct bpf_prog		*io_submit_prog;
+
 	/*
 	 * Our ubq->daemon may be killed without any notification, so
 	 * monitor each queue's daemon periodically
@@ -189,10 +193,46 @@  static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+struct ublk_io_bpf_ctx {
+	struct ublk_bpf_ctx ctx;
+	struct ublk_device *ub;
+	struct callback_head work;
+};
+
+BPF_CALL_4(bpf_ublk_queue_sqe, struct ublk_io_bpf_ctx *, bpf_ctx,
+	   struct io_uring_sqe *, sqe, u32, sqe_len, u32, fd)
+{
+	struct request *rq;
+	struct ublk_rq_data *data;
+	struct io_mapped_kbuf *kbuf;
+	u16 q_id = bpf_ctx->ctx.q_id;
+	u16 tag = bpf_ctx->ctx.tag;
+
+	rq = blk_mq_tag_to_rq(bpf_ctx->ub->tag_set.tags[q_id], tag);
+	data = blk_mq_rq_to_pdu(rq);
+	kbuf = data->kbuf;
+	io_uring_submit_sqe(fd, sqe, sqe_len, kbuf);
+	return 0;
+}
+
+const struct bpf_func_proto ublk_bpf_queue_sqe_proto = {
+	.func = bpf_ublk_queue_sqe,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_ANYTHING,
+	.arg2_type = ARG_ANYTHING,
+	.arg3_type = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 ublk_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	switch (func_id) {
+	case BPF_FUNC_ublk_queue_sqe:
+		return &ublk_bpf_queue_sqe_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
 }
 
 static bool ublk_bpf_is_valid_access(int off, int size,
@@ -200,6 +240,23 @@  static bool ublk_bpf_is_valid_access(int off, int size,
 			const struct bpf_prog *prog,
 			struct bpf_insn_access_aux *info)
 {
+	if (off < 0 || off >= sizeof(struct ublk_bpf_ctx))
+		return false;
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case offsetof(struct ublk_bpf_ctx, q_id):
+		return size == sizeof_field(struct ublk_bpf_ctx, q_id);
+	case offsetof(struct ublk_bpf_ctx, tag):
+		return size == sizeof_field(struct ublk_bpf_ctx, tag);
+	case offsetof(struct ublk_bpf_ctx, op):
+		return size == sizeof_field(struct ublk_bpf_ctx, op);
+	case offsetof(struct ublk_bpf_ctx, nr_sectors):
+		return size == sizeof_field(struct ublk_bpf_ctx, nr_sectors);
+	case offsetof(struct ublk_bpf_ctx, start_sector):
+		return size == sizeof_field(struct ublk_bpf_ctx, start_sector);
+	}
 	return false;
 }
 
@@ -324,7 +381,7 @@  static void ublk_put_device(struct ublk_device *ub)
 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
 		int qid)
 {
-       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
 }
 
 static inline bool ublk_rq_has_data(const struct request *rq)
@@ -492,12 +549,16 @@  static inline int ublk_copy_user_pages(struct ublk_map_data *data,
 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 		struct ublk_io *io)
 {
+	struct ublk_device *ub = ubq->dev;
 	const unsigned int rq_bytes = blk_rq_bytes(req);
 	/*
 	 * no zero copy, we delay copy WRITE request data into ublksrv
 	 * context and the big benefit is that pinning pages in current
 	 * context is pretty fast, see ublk_pin_user_pages
 	 */
+	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
+		return rq_bytes;
+
 	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
 		return rq_bytes;
 
@@ -860,6 +921,89 @@  static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	}
 }
 
+static void ublk_bpf_io_submit_fn(struct callback_head *work)
+{
+	struct ublk_io_bpf_ctx *bpf_ctx = container_of(work,
+			struct ublk_io_bpf_ctx, work);
+
+	if (bpf_ctx->ub->io_submit_prog)
+		bpf_prog_run_pin_on_cpu(bpf_ctx->ub->io_submit_prog, bpf_ctx);
+	kfree(bpf_ctx);
+}
+
+static int ublk_init_uring_kbuf(struct request *rq)
+{
+	struct bio_vec *bvec;
+	struct req_iterator rq_iter;
+	struct bio_vec tmp;
+	int nr_bvec = 0;
+	struct io_mapped_kbuf *kbuf;
+	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+
+	/* Drop previous allocation */
+	if (data->kbuf) {
+		kfree(data->kbuf->bvec);
+		kfree(data->kbuf);
+		data->kbuf = NULL;
+	}
+
+	kbuf = kmalloc(sizeof(struct io_mapped_kbuf), GFP_NOIO);
+	if (!kbuf)
+		return -EIO;
+
+	rq_for_each_bvec(tmp, rq, rq_iter)
+		nr_bvec++;
+
+	bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO);
+	if (!bvec) {
+		kfree(kbuf);
+		return -EIO;
+	}
+	kbuf->bvec = bvec;
+	rq_for_each_bvec(tmp, rq, rq_iter) {
+		*bvec = tmp;
+		bvec++;
+	}
+
+	kbuf->count = blk_rq_bytes(rq);
+	kbuf->nr_bvecs = nr_bvec;
+	data->kbuf = kbuf;
+	return 0;
+}
+
+static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
+{
+	int err;
+	struct ublk_device *ub = ubq->dev;
+	struct bpf_prog *prog = ub->io_prep_prog;
+	struct ublk_io_bpf_ctx *bpf_ctx;
+
+	if (!prog)
+		return 0;
+
+	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
+	if (!bpf_ctx)
+		return -EIO;
+
+	err = ublk_init_uring_kbuf(rq);
+	if (err < 0) {
+		kfree(bpf_ctx);
+		return -EIO;
+	}
+	bpf_ctx->ub = ub;
+	bpf_ctx->ctx.q_id = ubq->q_id;
+	bpf_ctx->ctx.tag = rq->tag;
+	bpf_ctx->ctx.op = req_op(rq);
+	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
+	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);
+	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
+
+	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
+	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
+		kfree(bpf_ctx);
+	return 0;
+}
+
 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -872,6 +1016,9 @@  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (unlikely(res != BLK_STS_OK))
 		return BLK_STS_IOERR;
 
+	/* Currently just for test. */
+	ublk_run_bpf_prog(ubq, rq);
+
 	/* With recovery feature enabled, force_abort is set in
 	 * ublk_stop_dev() before calling del_gendisk(). We have to
 	 * abort all requeued and new rqs here to let del_gendisk()
@@ -2009,6 +2156,56 @@  static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd)
 	return ret;
 }
 
+static int ublk_ctrl_reg_bpf_prog(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublk_device *ub;
+	struct bpf_prog *prog;
+	int ret = 0;
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	mutex_lock(&ub->mutex);
+	prog = bpf_prog_get_type(header->data[0], BPF_PROG_TYPE_UBLK);
+	if (IS_ERR(prog)) {
+		ret = PTR_ERR(prog);
+		goto out_unlock;
+	}
+	ub->io_prep_prog = prog;
+
+	prog = bpf_prog_get_type(header->data[1], BPF_PROG_TYPE_UBLK);
+	if (IS_ERR(prog)) {
+		ret = PTR_ERR(prog);
+		goto out_unlock;
+	}
+	ub->io_submit_prog = prog;
+
+out_unlock:
+	mutex_unlock(&ub->mutex);
+	ublk_put_device(ub);
+	return ret;
+}
+
+static int ublk_ctrl_unreg_bpf_prog(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublk_device *ub;
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	mutex_lock(&ub->mutex);
+	bpf_prog_put(ub->io_prep_prog);
+	bpf_prog_put(ub->io_submit_prog);
+	ub->io_prep_prog = NULL;
+	ub->io_submit_prog = NULL;
+	mutex_unlock(&ub->mutex);
+	ublk_put_device(ub);
+	return 0;
+}
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
@@ -2059,6 +2256,12 @@  static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	case UBLK_CMD_END_USER_RECOVERY:
 		ret = ublk_ctrl_end_recovery(cmd);
 		break;
+	case UBLK_CMD_REG_BPF_PROG:
+		ret = ublk_ctrl_reg_bpf_prog(cmd);
+		break;
+	case UBLK_CMD_UNREG_BPF_PROG:
+		ret = ublk_ctrl_unreg_bpf_prog(cmd);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 515b7b995b3a..578d65e9f30e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5699,6 +5699,7 @@  union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)		\
 	FN(cgrp_storage_get, 210, ##ctx)		\
 	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(ublk_queue_sqe, 212, ##ctx)			\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 8f88e3a29998..a43b1864de51 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -17,6 +17,8 @@ 
 #define	UBLK_CMD_STOP_DEV	0x07
 #define	UBLK_CMD_SET_PARAMS	0x08
 #define	UBLK_CMD_GET_PARAMS	0x09
+#define UBLK_CMD_REG_BPF_PROG		0x0a
+#define UBLK_CMD_UNREG_BPF_PROG		0x0b
 #define	UBLK_CMD_START_USER_RECOVERY	0x10
 #define	UBLK_CMD_END_USER_RECOVERY	0x11
 /*
@@ -230,4 +232,13 @@  struct ublk_params {
 	struct ublk_param_discard	discard;
 };
 
+struct ublk_bpf_ctx {
+	__u32	t_val;
+	__u16	q_id;
+	__u16	tag;
+	__u8	op;
+	__u32	nr_sectors;
+	__u64	start_sector;
+};
+
 #endif
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index e8d90829f23e..f8672294e145 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -700,6 +700,8 @@  class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct ublk_io_bpf_ctx',
+            'struct io_uring_sqe',
     ]
     known_types = {
             '...',
@@ -755,6 +757,8 @@  class PrinterHelpers(Printer):
             'const struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct ublk_io_bpf_ctx',
+            'struct io_uring_sqe',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 515b7b995b3a..530094246e2a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5485,6 +5485,13 @@  union bpf_attr {
  *		0 on success.
  *
  *		**-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * u64 bpf_ublk_queue_sqe(struct ublk_io_bpf_ctx *ctx, struct io_uring_sqe *sqe, u32 offset, u32 len)
+ *	Description
+ *		Submit ublk io requests.
+ *	Return
+ *		0 on success.
+ *
  */
 #define ___BPF_FUNC_MAPPER(FN, ctx...)			\
 	FN(unspec, 0, ##ctx)				\
@@ -5699,6 +5706,7 @@  union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)		\
 	FN(cgrp_storage_get, 210, ##ctx)		\
 	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(ublk_queue_sqe, 212, ##ctx)			\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't