Message ID | 20241016185252.3746190-11-dw@davidwei.uk (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | io_uring zero copy rx | expand |
On 10/16/24 12:52 PM, David Wei wrote: > +static int io_zcrx_create_area(struct io_ring_ctx *ctx, > + struct io_zcrx_ifq *ifq, > + struct io_zcrx_area **res, > + struct io_uring_zcrx_area_reg *area_reg) > +{ > + struct io_zcrx_area *area; > + int i, ret, nr_pages; > + struct iovec iov; > + > + if (area_reg->flags || area_reg->rq_area_token) > + return -EINVAL; > + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) > + return -EINVAL; > + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) > + return -EINVAL; > + > + iov.iov_base = u64_to_user_ptr(area_reg->addr); > + iov.iov_len = area_reg->len; > + ret = io_buffer_validate(&iov); > + if (ret) > + return ret; > + > + ret = -ENOMEM; > + area = kzalloc(sizeof(*area), GFP_KERNEL); > + if (!area) > + goto err; > + > + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, > + &nr_pages); > + if (IS_ERR(area->pages)) { > + ret = PTR_ERR(area->pages); > + area->pages = NULL; > + goto err; > + } > + area->nia.num_niovs = nr_pages; > + > + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), > + GFP_KERNEL | __GFP_ZERO); > + if (!area->nia.niovs) > + goto err; > + > + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), > + GFP_KERNEL | __GFP_ZERO); > + if (!area->freelist) > + goto err; > + > + for (i = 0; i < nr_pages; i++) { > + area->freelist[i] = i; > + } > + > + area->free_count = nr_pages; > + area->ifq = ifq; > + /* we're only supporting one area per ifq for now */ > + area->area_id = 0; > + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; > + spin_lock_init(&area->freelist_lock); > + *res = area; > + return 0; > +err: > + if (area) > + io_zcrx_free_area(area); > + return ret; > +} Minor nit, but I think this would be nicer returning area and just using ERR_PTR() for the errors. Outside of that: Reviewed-by: Jens Axboe <axboe@kernel.dk>
On 10/21/24 16:35, Jens Axboe wrote: > On 10/16/24 12:52 PM, David Wei wrote: >> +static int io_zcrx_create_area(struct io_ring_ctx *ctx, >> + struct io_zcrx_ifq *ifq, >> + struct io_zcrx_area **res, >> + struct io_uring_zcrx_area_reg *area_reg) >> +{ >> + struct io_zcrx_area *area; >> + int i, ret, nr_pages; >> + struct iovec iov; >> + >> + if (area_reg->flags || area_reg->rq_area_token) >> + return -EINVAL; >> + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) >> + return -EINVAL; >> + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) >> + return -EINVAL; >> + >> + iov.iov_base = u64_to_user_ptr(area_reg->addr); >> + iov.iov_len = area_reg->len; >> + ret = io_buffer_validate(&iov); >> + if (ret) >> + return ret; >> + >> + ret = -ENOMEM; >> + area = kzalloc(sizeof(*area), GFP_KERNEL); >> + if (!area) >> + goto err; >> + >> + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, >> + &nr_pages); >> + if (IS_ERR(area->pages)) { >> + ret = PTR_ERR(area->pages); >> + area->pages = NULL; >> + goto err; >> + } >> + area->nia.num_niovs = nr_pages; >> + >> + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), >> + GFP_KERNEL | __GFP_ZERO); >> + if (!area->nia.niovs) >> + goto err; >> + >> + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), >> + GFP_KERNEL | __GFP_ZERO); >> + if (!area->freelist) >> + goto err; >> + >> + for (i = 0; i < nr_pages; i++) { >> + area->freelist[i] = i; >> + } >> + >> + area->free_count = nr_pages; >> + area->ifq = ifq; >> + /* we're only supporting one area per ifq for now */ >> + area->area_id = 0; >> + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; >> + spin_lock_init(&area->freelist_lock); >> + *res = area; >> + return 0; >> +err: >> + if (area) >> + io_zcrx_free_area(area); >> + return ret; >> +} > > Minor nit, but I think this would be nicer returning area and just using > ERR_PTR() for the errors. I'd rather avoid it. Too often null vs IS_ERR checking gets messed up down the road and the compiler doesn't help with it at all. Not related to the patch, but would be nice to have a type safer way for that, e.g. returning some new type not directly cast'able to the pointer. > > Outside of that: > > Reviewed-by: Jens Axboe <axboe@kernel.dk> >
On 10/21/24 10:28 AM, Pavel Begunkov wrote: > On 10/21/24 16:35, Jens Axboe wrote: >> On 10/16/24 12:52 PM, David Wei wrote: >>> +static int io_zcrx_create_area(struct io_ring_ctx *ctx, >>> + struct io_zcrx_ifq *ifq, >>> + struct io_zcrx_area **res, >>> + struct io_uring_zcrx_area_reg *area_reg) >>> +{ >>> + struct io_zcrx_area *area; >>> + int i, ret, nr_pages; >>> + struct iovec iov; >>> + >>> + if (area_reg->flags || area_reg->rq_area_token) >>> + return -EINVAL; >>> + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) >>> + return -EINVAL; >>> + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) >>> + return -EINVAL; >>> + >>> + iov.iov_base = u64_to_user_ptr(area_reg->addr); >>> + iov.iov_len = area_reg->len; >>> + ret = io_buffer_validate(&iov); >>> + if (ret) >>> + return ret; >>> + >>> + ret = -ENOMEM; >>> + area = kzalloc(sizeof(*area), GFP_KERNEL); >>> + if (!area) >>> + goto err; >>> + >>> + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, >>> + &nr_pages); >>> + if (IS_ERR(area->pages)) { >>> + ret = PTR_ERR(area->pages); >>> + area->pages = NULL; >>> + goto err; >>> + } >>> + area->nia.num_niovs = nr_pages; >>> + >>> + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), >>> + GFP_KERNEL | __GFP_ZERO); >>> + if (!area->nia.niovs) >>> + goto err; >>> + >>> + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), >>> + GFP_KERNEL | __GFP_ZERO); >>> + if (!area->freelist) >>> + goto err; >>> + >>> + for (i = 0; i < nr_pages; i++) { >>> + area->freelist[i] = i; >>> + } >>> + >>> + area->free_count = nr_pages; >>> + area->ifq = ifq; >>> + /* we're only supporting one area per ifq for now */ >>> + area->area_id = 0; >>> + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; >>> + spin_lock_init(&area->freelist_lock); >>> + *res = area; >>> + return 0; >>> +err: >>> + if (area) >>> + io_zcrx_free_area(area); >>> + return ret; >>> +} >> >> Minor nit, but I think this would be nicer returning area and just using >> ERR_PTR() for the errors. > > I'd rather avoid it. Too often null vs IS_ERR checking gets > messed up down the road and the compiler doesn't help with it > at all. The main issue imho is when people mix NULL and ERR_PTR, the pure "valid pointer or non-null error pointer" seem to be OK in terms of maintainability. But like I said, not a huge deal, and it's not hot path material so doesn't matter in terms of that. > Not related to the patch, but would be nice to have a type safer > way for that, e.g. returning some new type not directly > cast'able to the pointer. Definitely, room for improvement in the infrastructure for this.
On 10/21/24 17:29, Jens Axboe wrote: > On 10/21/24 10:28 AM, Pavel Begunkov wrote: >> On 10/21/24 16:35, Jens Axboe wrote: >>> On 10/16/24 12:52 PM, David Wei wrote: >>>> +static int io_zcrx_create_area(struct io_ring_ctx *ctx, >>>> + struct io_zcrx_ifq *ifq, >>>> + struct io_zcrx_area **res, >>>> + struct io_uring_zcrx_area_reg *area_reg) >>>> +{ >>>> + struct io_zcrx_area *area; >>>> + int i, ret, nr_pages; >>>> + struct iovec iov; >>>> + >>>> + if (area_reg->flags || area_reg->rq_area_token) >>>> + return -EINVAL; >>>> + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) >>>> + return -EINVAL; >>>> + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) >>>> + return -EINVAL; >>>> + >>>> + iov.iov_base = u64_to_user_ptr(area_reg->addr); >>>> + iov.iov_len = area_reg->len; >>>> + ret = io_buffer_validate(&iov); >>>> + if (ret) >>>> + return ret; >>>> + >>>> + ret = -ENOMEM; >>>> + area = kzalloc(sizeof(*area), GFP_KERNEL); >>>> + if (!area) >>>> + goto err; >>>> + >>>> + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, >>>> + &nr_pages); >>>> + if (IS_ERR(area->pages)) { >>>> + ret = PTR_ERR(area->pages); >>>> + area->pages = NULL; >>>> + goto err; >>>> + } >>>> + area->nia.num_niovs = nr_pages; >>>> + >>>> + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), >>>> + GFP_KERNEL | __GFP_ZERO); >>>> + if (!area->nia.niovs) >>>> + goto err; >>>> + >>>> + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), >>>> + GFP_KERNEL | __GFP_ZERO); >>>> + if (!area->freelist) >>>> + goto err; >>>> + >>>> + for (i = 0; i < nr_pages; i++) { >>>> + area->freelist[i] = i; >>>> + } >>>> + >>>> + area->free_count = nr_pages; >>>> + area->ifq = ifq; >>>> + /* we're only supporting one area per ifq for now */ >>>> + area->area_id = 0; >>>> + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; >>>> + spin_lock_init(&area->freelist_lock); >>>> + *res = area; >>>> + return 0; >>>> +err: >>>> + if (area) >>>> + io_zcrx_free_area(area); >>>> + return ret; >>>> +} >>> >>> Minor nit, but I think this would be nicer returning area and just using >>> ERR_PTR() for the errors. >> >> I'd rather avoid it. Too often null vs IS_ERR checking gets >> messed up down the road and the compiler doesn't help with it >> at all. > > The main issue imho is when people mix NULL and ERR_PTR, the pure "valid > pointer or non-null error pointer" seem to be OK in terms of Right, I meant it in general, mixing normal pointer types with the implicit type that can have an error. > maintainability. But like I said, not a huge deal, and it's not hot path > material so doesn't matter in terms of that. I agree it's maintainable, but this way I don't even need to think about it. >> Not related to the patch, but would be nice to have a type safer >> way for that, e.g. returning some new type not directly >> cast'able to the pointer. > > Definitely, room for improvement in the infrastructure for this. >
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d398e19f8eea..d43183264dcf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -874,6 +874,15 @@ struct io_uring_zcrx_offsets { __u64 __resv[2]; }; +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 rq_area_token; + __u32 flags; + __u32 __resv1; + __u64 __resv2[2]; +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 33a3d156a85b..4da644de8843 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -86,7 +86,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8ed588036210..0933dc99f41d 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -83,6 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +int io_buffer_validate(struct iovec *iov); static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 4c53fd4f7bb3..a276572fe953 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -10,6 +10,7 @@ #include "kbuf.h" #include "memmap.h" #include "zcrx.h" +#include "rsrc.h" #define IO_RQ_MAX_ENTRIES 32768 @@ -38,6 +39,83 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) ifq->rqes = NULL; } +static void io_zcrx_free_area(struct io_zcrx_area *area) +{ + if (area->freelist) + kvfree(area->freelist); + if (area->nia.niovs) + kvfree(area->nia.niovs); + if (area->pages) { + unpin_user_pages(area->pages, area->nia.num_niovs); + kvfree(area->pages); + } + kfree(area); +} + +static int io_zcrx_create_area(struct io_ring_ctx *ctx, + struct io_zcrx_ifq *ifq, + struct io_zcrx_area **res, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct io_zcrx_area *area; + int i, ret, nr_pages; + struct iovec iov; + + if (area_reg->flags || area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + iov.iov_base = u64_to_user_ptr(area_reg->addr); + iov.iov_len = area_reg->len; + ret = io_buffer_validate(&iov); + if (ret) + return ret; + + ret = -ENOMEM; + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + goto err; + + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(area->pages)) { + ret = PTR_ERR(area->pages); + area->pages = NULL; + goto err; + } + area->nia.num_niovs = nr_pages; + + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->nia.niovs) + goto err; + + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->freelist) + goto err; + + for (i = 0; i < nr_pages; i++) { + area->freelist[i] = i; + } + + area->free_count = nr_pages; + area->ifq = ifq; + /* we're only supporting one area per ifq for now */ + area->area_id = 0; + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; + spin_lock_init(&area->freelist_lock); + *res = area; + return 0; +err: + if (area) + io_zcrx_free_area(area); + return ret; +} + static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) { struct io_zcrx_ifq *ifq; @@ -53,6 +131,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) { + if (ifq->area) + io_zcrx_free_area(ifq->area); + io_free_rbuf_ring(ifq); kfree(ifq); } @@ -60,6 +141,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { + struct io_uring_zcrx_area_reg area; struct io_uring_zcrx_ifq_reg reg; struct io_zcrx_ifq *ifq; size_t ring_sz, rqes_sz; @@ -91,7 +173,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, } reg.rq_entries = roundup_pow_of_two(reg.rq_entries); - if (!reg.area_ptr) + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) return -EFAULT; ifq = io_zcrx_ifq_alloc(ctx); @@ -102,6 +184,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (ret) goto err; + ret = io_zcrx_create_area(ctx, ifq, &ifq->area, &area); + if (ret) + goto err; + ifq->rq_entries = reg.rq_entries; ifq->if_rxq = reg.if_rxq; @@ -116,7 +202,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - + if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + ret = -EFAULT; + goto err; + } ctx->ifq = ifq; return 0; err: diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 1f76eecac5fd..a8db61498c67 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,10 +3,26 @@ #define IOU_ZC_RX_H #include <linux/io_uring_types.h> +#include <net/page_pool/types.h> + +struct io_zcrx_area { + struct net_iov_area nia; + struct io_zcrx_ifq *ifq; + + u16 area_id; + struct page **pages; + + /* freelist */ + spinlock_t freelist_lock ____cacheline_aligned_in_smp; + u32 free_count; + u32 *freelist; +}; struct io_zcrx_ifq { struct io_ring_ctx *ctx; struct net_device *dev; + struct io_zcrx_area *area; + struct io_uring *rq_ring; struct io_uring_zcrx_rqe *rqes; u32 rq_entries;