Message ID | 20241111123650.1857526-1-xue01.he@samsung.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [liburing] test: add test cases for hybrid iopoll | expand |
On 11/11/24 08:36PM, hexue wrote: >Add a test file for hybrid iopoll to make sure it works safe.Testcass >include basic read/write tests, and run in normal iopoll mode and >passthrough mode respectively. > >Signed-off-by: hexue <xue01.he@samsung.com> >--- > man/io_uring_setup.2 | 10 +- > src/include/liburing/io_uring.h | 3 + > test/Makefile | 1 + i> test/iopoll-hybridpoll.c | 544 ++++++++++++++++++++++++++++++++ > 4 files changed, 557 insertions(+), 1 deletion(-) > create mode 100644 test/iopoll-hybridpoll.c > >diff --git a/man/io_uring_setup.2 b/man/io_uring_setup.2 >index 2f87783..8cfafdc 100644 >--- a/man/io_uring_setup.2 >+++ b/man/io_uring_setup.2 >@@ -78,7 +78,15 @@ in question. For NVMe devices, the nvme driver must be loaded with the > parameter set to the desired number of polling queues. The polling queues > will be shared appropriately between the CPUs in the system, if the number > is less than the number of online CPU threads. >- >+.TP >+.B IORING_SETUP_HYBRID_IOPOLL >+This flag must setup with s/<must setup with>/<must be used with> >+.B IORING_SETUP_IOPOLL >+flag. hybrid poll is a new s/hybrid/Hybrid "new" is probably not required here >+feature baed on iopoll, this could be a suboptimal solution when running s/baed/based s/<this could>/<which might> >+on a single thread, it offers higher performance than IRQ and lower CPU s/<, it>/<. It> >+utilization than polling. Similarly, this feature also requires the devices >+to support polling configuration. This feature would work if a device doesn't have polled queues,right? The performance might be suboptimal in that case, but the userspace won't get any errors. > .TP > .B IORING_SETUP_SQPOLL > When this flag is specified, a kernel thread is created to perform >diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h >index 20bc570..d16364c 100644 >--- a/src/include/liburing/io_uring.h >+++ b/src/include/liburing/io_uring.h >@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit { > */ > #define IORING_SETUP_NO_SQARRAY (1U << 16) > >+/* Use hybrid poll in iopoll process */ >+#define IORING_SETUP_HYBRID_IOPOLL (1U << 17) >+ > enum io_uring_op { > IORING_OP_NOP, > IORING_OP_READV, >diff --git a/test/Makefile b/test/Makefile >index dfbbcbe..ea9452c 100644 >--- a/test/Makefile >+++ b/test/Makefile >@@ -116,6 +116,7 @@ test_srcs := \ > iopoll.c \ > iopoll-leak.c \ > iopoll-overflow.c \ >+ iopoll-hybridpoll.c \ > io_uring_enter.c \ > io_uring_passthrough.c \ > io_uring_register.c \ >diff --git a/test/iopoll-hybridpoll.c b/test/iopoll-hybridpoll.c >new file mode 100644 >index 0000000..d7c08ae >--- /dev/null >+++ b/test/iopoll-hybridpoll.c >@@ -0,0 +1,544 @@ >+/* SPDX-License-Identifier: MIT */ >+/* >+ * Description: basic read/write tests with >+ * hybrid polled IO, include iopoll and io_uring >+ * passthrough. >+ */ >+#include <errno.h> >+#include <stdio.h> >+#include <unistd.h> >+#include <stdlib.h> >+#include <string.h> >+#include <fcntl.h> >+#include <sys/types.h> >+#include <poll.h> >+#include <sys/eventfd.h> >+#include <sys/resource.h> >+#include "helpers.h" >+#include "liburing.h" >+#include "../src/syscall.h" >+#include "nvme.h" >+ >+#define FILE_SIZE (128 * 1024) >+#define BS 4096 >+#define BUFFERS (FILE_SIZE / BS) >+ >+static struct iovec *vecs; >+static int no_pt, no_iopoll; >+ >+static int fill_pattern(int tc) >+{ >+ unsigned int val, *ptr; >+ int i, j; >+ int u_in_buf = BS / sizeof(val); >+ >+ val = (tc / 2) * FILE_SIZE; >+ for (i = 0; i < BUFFERS; i++) { >+ ptr = vecs[i].iov_base; >+ for (j = 0; j < u_in_buf; j++) { >+ *ptr = val; >+ val++; >+ ptr++; >+ } >+ } >+ >+ return 0; >+} >+ >+static int verify_buf(int tc, void *buf, off_t off) >+{ >+ int i, u_in_buf = BS / sizeof(unsigned int); >+ unsigned int *ptr; >+ >+ off /= sizeof(unsigned int); >+ off += (tc / 2) * FILE_SIZE; >+ ptr = buf; >+ for (i = 0; i < u_in_buf; i++) { >+ if (off != *ptr) { >+ fprintf(stderr, "Found %u, wanted %llu\n", *ptr, >+ (unsigned long long) off); >+ return 1; >+ } >+ ptr++; >+ off++; >+ } >+ >+ return 0; >+} >+ >+static int __test_io_uring_passthrough_io(const char *file, struct io_uring *ring, int tc, int write, >+ int sqthread, int fixed, int nonvec) >+{ >+ struct io_uring_sqe *sqe; >+ struct io_uring_cqe *cqe; >+ struct nvme_uring_cmd *cmd; >+ int open_flags; >+ int do_fixed; >+ int i, ret, fd = -1; >+ off_t offset; >+ __u64 slba; >+ __u32 nlb; >+ >+ if (write) >+ open_flags = O_WRONLY; >+ else >+ open_flags = O_RDONLY; >+ >+ if (fixed) { >+ ret = t_register_buffers(ring, vecs, BUFFERS); >+ if (ret == T_SETUP_SKIP) >+ return 0; >+ if (ret != T_SETUP_OK) { >+ fprintf(stderr, "buffer reg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ >+ fd = open(file, open_flags); >+ if (fd < 0) { >+ if (errno == EACCES || errno == EPERM) >+ return T_EXIT_SKIP; >+ perror("file open"); >+ goto err; >+ } >+ >+ if (sqthread) { >+ ret = io_uring_register_files(ring, &fd, 1); >+ if (ret) { >+ fprintf(stderr, "file reg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ >+ if (write) >+ fill_pattern(tc); >+ >+ offset = 0; >+ for (i = 0; i < BUFFERS; i++) { >+ sqe = io_uring_get_sqe(ring); >+ if (!sqe) { >+ fprintf(stderr, "sqe get failed\n"); >+ goto err; >+ } >+ if (write) { >+ int use_fd = fd; >+ >+ do_fixed = fixed; >+ >+ if (sqthread) >+ use_fd = 0; >+ if (fixed && (i & 1)) >+ do_fixed = 0; >+ if (do_fixed) { >+ io_uring_prep_write_fixed(sqe, use_fd, vecs[i].iov_base, >+ vecs[i].iov_len, >+ offset, i); >+ sqe->cmd_op = NVME_URING_CMD_IO; >+ } else if (nonvec) { >+ io_uring_prep_write(sqe, use_fd, vecs[i].iov_base, >+ vecs[i].iov_len, offset); >+ sqe->cmd_op = NVME_URING_CMD_IO; >+ } else { >+ io_uring_prep_writev(sqe, use_fd, &vecs[i], 1, >+ offset); >+ sqe->cmd_op = NVME_URING_CMD_IO_VEC; >+ } >+ } else { >+ int use_fd = fd; >+ >+ do_fixed = fixed; >+ >+ if (sqthread) >+ use_fd = 0; >+ if (fixed && (i & 1)) >+ do_fixed = 0; >+ if (do_fixed) { >+ io_uring_prep_read_fixed(sqe, use_fd, vecs[i].iov_base, >+ vecs[i].iov_len, >+ offset, i); >+ sqe->cmd_op = NVME_URING_CMD_IO; >+ } else if (nonvec) { >+ io_uring_prep_read(sqe, use_fd, vecs[i].iov_base, >+ vecs[i].iov_len, offset); >+ sqe->cmd_op = NVME_URING_CMD_IO; >+ } else { >+ io_uring_prep_readv(sqe, use_fd, &vecs[i], 1, >+ offset); >+ sqe->cmd_op = NVME_URING_CMD_IO_VEC; >+ } >+ } >+ sqe->opcode = IORING_OP_URING_CMD; >+ if (do_fixed) >+ sqe->uring_cmd_flags |= IORING_URING_CMD_FIXED; >+ sqe->user_data = ((uint64_t)offset << 32) | i; >+ if (sqthread) >+ sqe->flags |= IOSQE_FIXED_FILE; >+ >+ cmd = (struct nvme_uring_cmd *)sqe->cmd; >+ memset(cmd, 0, sizeof(struct nvme_uring_cmd)); >+ >+ cmd->opcode = write ? nvme_cmd_write : nvme_cmd_read; >+ >+ slba = offset >> lba_shift; >+ nlb = (BS >> lba_shift) - 1; >+ >+ /* cdw10 and cdw11 represent starting lba */ >+ cmd->cdw10 = slba & 0xffffffff; >+ cmd->cdw11 = slba >> 32; >+ /* cdw12 represent number of lba's for read/write */ >+ cmd->cdw12 = nlb; >+ if (do_fixed || nonvec) { >+ cmd->addr = (__u64)(uintptr_t)vecs[i].iov_base; >+ cmd->data_len = vecs[i].iov_len; >+ } else { >+ cmd->addr = (__u64)(uintptr_t)&vecs[i]; >+ cmd->data_len = 1; >+ } >+ cmd->nsid = nsid; >+ >+ offset += BS; >+ } >+ >+ ret = io_uring_submit(ring); >+ if (ret != BUFFERS) { >+ fprintf(stderr, "submit got %d, wanted %d\n", ret, BUFFERS); >+ goto err; >+ } >+ >+ for (i = 0; i < BUFFERS; i++) { >+ ret = io_uring_wait_cqe(ring, &cqe); >+ if (ret) { >+ fprintf(stderr, "wait_cqe=%d\n", ret); >+ goto err; >+ } >+ if (cqe->res != 0) { >+ if (!no_pt) { >+ no_pt = 1; >+ goto skip; >+ } >+ fprintf(stderr, "cqe res %d, wanted 0\n", cqe->res); >+ goto err; >+ } >+ io_uring_cqe_seen(ring, cqe); >+ if (!write) { >+ int index = cqe->user_data & 0xffffffff; >+ void *buf = vecs[index].iov_base; >+ off_t voff = cqe->user_data >> 32; >+ >+ if (verify_buf(tc, buf, voff)) >+ goto err; >+ } >+ } >+ >+ if (fixed) { >+ ret = io_uring_unregister_buffers(ring); >+ if (ret) { >+ fprintf(stderr, "buffer unreg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ if (sqthread) { >+ ret = io_uring_unregister_files(ring); >+ if (ret) { >+ fprintf(stderr, "file unreg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ >+skip: >+ close(fd); >+ return 0; >+err: >+ if (fd != -1) >+ close(fd); >+ return 1; >+} >+ >+static int test_io_uring_passthrough(const char *file, int tc, int write, int sqthread, >+ int fixed, int nonvec) >+{ >+ struct io_uring ring; >+ int ret, ring_flags = 0; >+ >+ ring_flags |= IORING_SETUP_SQE128; >+ ring_flags |= IORING_SETUP_CQE32; >+ ring_flags |= IORING_SETUP_HYBRID_IOPOLL; >+ >+ if (sqthread) >+ ring_flags |= IORING_SETUP_SQPOLL; >+ >+ ret = t_create_ring(64, &ring, ring_flags); >+ if (ret == T_SETUP_SKIP) >+ return 0; >+ if (ret != T_SETUP_OK) { >+ if (ret == -EINVAL) { >+ no_pt = 1; >+ return T_SETUP_SKIP; >+ } >+ fprintf(stderr, "ring create failed: %d\n", ret); >+ return 1; >+ } >+ >+ ret = __test_io_uring_passthrough_io(file, &ring, tc, write, sqthread, fixed, nonvec); >+ io_uring_queue_exit(&ring); >+ >+ return ret; >+} >+ >+static int provide_buffers(struct io_uring *ring) >+{ >+ struct io_uring_sqe *sqe; >+ struct io_uring_cqe *cqe; >+ int ret, i; >+ >+ for (i = 0; i < BUFFERS; i++) { >+ sqe = io_uring_get_sqe(ring); >+ io_uring_prep_provide_buffers(sqe, vecs[i].iov_base, >+ vecs[i].iov_len, 1, 1, i); >+ } >+ >+ ret = io_uring_submit(ring); >+ if (ret != BUFFERS) { >+ fprintf(stderr, "submit: %d\n", ret); >+ return 1; >+ } >+ >+ for (i = 0; i < BUFFERS; i++) { >+ ret = io_uring_wait_cqe(ring, &cqe); >+ if (cqe->res < 0) { >+ fprintf(stderr, "cqe->res=%d\n", cqe->res); >+ return 1; >+ } >+ io_uring_cqe_seen(ring, cqe); >+ } >+ >+ return 0; >+} >+ >+static int __test_iopoll_io(const char *file, struct io_uring *ring, int write, int sqthread, >+ int fixed, int buf_select) >+{ >+ struct io_uring_sqe *sqe; >+ struct io_uring_cqe *cqe; >+ int open_flags; >+ int i, fd = -1, ret; >+ off_t offset; >+ >+ if (buf_select) { >+ write = 0; >+ fixed = 0; >+ } >+ if (buf_select && provide_buffers(ring)) >+ return 1; >+ >+ if (write) >+ open_flags = O_WRONLY; >+ else >+ open_flags = O_RDONLY; >+ open_flags |= O_DIRECT; >+ >+ if (fixed) { >+ ret = t_register_buffers(ring, vecs, BUFFERS); >+ if (ret == T_SETUP_SKIP) >+ return 0; >+ if (ret != T_SETUP_OK) { >+ fprintf(stderr, "buffer reg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ fd = open(file, open_flags); >+ if (fd < 0) { >+ if (errno == EINVAL || errno == EPERM || errno == EACCES) >+ return 0; >+ perror("file open"); >+ goto err; >+ } >+ if (sqthread) { >+ ret = io_uring_register_files(ring, &fd, 1); >+ if (ret) { >+ fprintf(stderr, "file reg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ >+ offset = 0; >+ for (i = 0; i < BUFFERS; i++) { >+ sqe = io_uring_get_sqe(ring); >+ if (!sqe) { >+ fprintf(stderr, "sqe get failed\n"); >+ goto err; >+ } >+ offset = BS * (rand() % BUFFERS); >+ if (write) { >+ int do_fixed = fixed; >+ int use_fd = fd; >+ >+ if (sqthread) >+ use_fd = 0; >+ if (fixed && (i & 1)) >+ do_fixed = 0; >+ if (do_fixed) { >+ io_uring_prep_write_fixed(sqe, use_fd, vecs[i].iov_base, >+ vecs[i].iov_len, >+ offset, i); >+ } else { >+ io_uring_prep_writev(sqe, use_fd, &vecs[i], 1, >+ offset); >+ } >+ } else { >+ int do_fixed = fixed; >+ int use_fd = fd; >+ >+ if (sqthread) >+ use_fd = 0; >+ if (fixed && (i & 1)) >+ do_fixed = 0; >+ if (do_fixed) { >+ io_uring_prep_read_fixed(sqe, use_fd, vecs[i].iov_base, >+ vecs[i].iov_len, >+ offset, i); >+ } else { >+ io_uring_prep_readv(sqe, use_fd, &vecs[i], 1, >+ offset); >+ } >+ >+ } >+ if (sqthread) >+ sqe->flags |= IOSQE_FIXED_FILE; >+ if (buf_select) { >+ sqe->flags |= IOSQE_BUFFER_SELECT; >+ sqe->buf_group = buf_select; >+ sqe->user_data = i; >+ } >+ } >+ >+ ret = io_uring_submit(ring); >+ if (ret != BUFFERS) { >+ ret = io_uring_peek_cqe(ring, &cqe); >+ if (!ret && cqe->res == -EOPNOTSUPP) { >+ no_iopoll = 1; >+ io_uring_cqe_seen(ring, cqe); >+ goto out; >+ } >+ fprintf(stderr, "submit got %d, wanted %d\n", ret, BUFFERS); >+ goto err; >+ } >+ >+ for (i = 0; i < BUFFERS; i++) { >+ ret = io_uring_wait_cqe(ring, &cqe); >+ if (ret) { >+ fprintf(stderr, "wait_cqe=%d\n", ret); >+ goto err; >+ } else if (cqe->res == -EOPNOTSUPP) { >+ fprintf(stdout, "File/device/fs doesn't support polled IO\n"); >+ no_iopoll = 1; >+ goto out; >+ } else if (cqe->res != BS) { >+ fprintf(stderr, "cqe res %d, wanted %d\n", cqe->res, BS); >+ goto err; >+ } >+ io_uring_cqe_seen(ring, cqe); >+ } >+ >+ if (fixed) { >+ ret = io_uring_unregister_buffers(ring); >+ if (ret) { >+ fprintf(stderr, "buffer unreg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ if (sqthread) { >+ ret = io_uring_unregister_files(ring); >+ if (ret) { >+ fprintf(stderr, "file unreg failed: %d\n", ret); >+ goto err; >+ } >+ } >+ >+out: >+ close(fd); >+ return 0; >+err: >+ if (fd != -1) >+ close(fd); >+ return 1; >+} >+ >+static int test_iopoll(const char *fname, int write, int sqthread, int fixed, >+ int buf_select, int defer) >+{ >+ struct io_uring ring; >+ int ret, ring_flags = IORING_SETUP_IOPOLL | IORING_SETUP_HYBRID_IOPOLL; >+ >+ if (no_iopoll) >+ return 0; >+ >+ if (defer) >+ ring_flags |= IORING_SETUP_SINGLE_ISSUER | >+ IORING_SETUP_DEFER_TASKRUN; >+ >+ ret = t_create_ring(64, &ring, ring_flags); >+ if (ret == T_SETUP_SKIP) >+ return 0; >+ if (ret != T_SETUP_OK) { >+ fprintf(stderr, "ring create failed: %d\n", ret); >+ return 1; >+ } >+ ret = __test_iopoll_io(fname, &ring, write, sqthread, fixed, buf_select); >+ io_uring_queue_exit(&ring); >+ return ret; >+} >+ >+int main(int argc, char *argv[]) >+{ >+ int i, ret; >+ char buf[256]; >+ char *fname; >+ >+ if (argc > 1) { >+ fname = argv[1]; >+ } else { >+ srand((unsigned)time(NULL)); >+ snprintf(buf, sizeof(buf), ".basic-rw-%u-%u", >+ (unsigned)rand(), (unsigned)getpid()); >+ fname = buf; >+ t_create_file(fname, FILE_SIZE); >+ } >+ >+ vecs = t_create_buffers(BUFFERS, BS); >+ >+ for (i = 0; i < 16; i++) { >+ int write = (i & 1) != 0; >+ int sqthread = (i & 2) != 0; >+ int fixed = (i & 4) != 0; >+ int buf_select = (i & 8) != 0; >+ int defer = (i & 16) != 0; >+ int nonvec = buf_select; >+ >+ ret = test_iopoll(fname, write, sqthread, fixed, buf_select, defer); >+ if (ret) { >+ fprintf(stderr, "test_iopoll_io failed %d/%d/%d/%d/%d\n", >+ write, sqthread, fixed, buf_select, defer); >+ goto err; >+ } >+ if (no_iopoll) >+ break; >+ >+ ret = test_io_uring_passthrough(fname, i, write, sqthread, fixed, nonvec); >+ if (no_pt) >+ break; >+ if (ret) { >+ fprintf(stderr, "test_io_uring_passthrough_io failed %d/%d/%d/%d\n", >+ write, sqthread, fixed, nonvec); >+ goto err; >+ } >+ } >+ >+ if (fname != argv[1]) >+ unlink(fname); >+ return ret; >+err: >+ if (fname != argv[1]) >+ unlink(fname); >+ return T_EXIT_FAIL; >+} This patch mostly looks fine. But the code here seems to be largely duplicated from "test/io_uring_passthrough.c" and "test/iopoll.c". Can we consider adding the hybrid poll test as a part of the existing tests as it seems that it would only require passing a extra flag during ring setup. >-- >2.34.1 >
On 11/12/24 3:44 AM, Anuj Gupta wrote: >> +utilization than polling. Similarly, this feature also requires the devices >> +to support polling configuration. > This feature would work if a device doesn't have polled queues,right? > The performance might be suboptimal in that case, but the userspace won't > get any errors. We've traditionally been a mix of lax and strict on this. IMHO we should return -EOPTNOTSUPP for IOPOLL (and IOPOLL|HYBRID) if polling isn't configured correctly. I've seen way too many not realize that they need to configure their nvme side for pollable queues for it to do what it needs to do. If you don't and it's just allowed, then you don't really get much of a win, you're just burning CPU. Hence I do think that this should strongly recommend that the devices support polling, that part is fine. Agree with your other comments, thanks for reviewing it! > This patch mostly looks fine. But the code here seems to be largely > duplicated from "test/io_uring_passthrough.c" and "test/iopoll.c". > Can we consider adding the hybrid poll test as a part of the existing > tests as it seems that it would only require passing a extra flag > during ring setup. Yeah I do think modifying test/iopoll.c to test all the same configurations but with HYBRID added would be the way to go, rather than duplicate all of this. Ditto for passthrough.
On 11/11/24 12:36, hexue wrote: > Add a test file for hybrid iopoll to make sure it works safe.Testcass > include basic read/write tests, and run in normal iopoll mode and > passthrough mode respectively. > > Signed-off-by: hexue <xue01.he@samsung.com> If it's not covered already please add tests for failure cases. E.g. when SETUP_HYBRID_IOPOLL is set without SETUP_IOPOLL > +static int test_io_uring_passthrough(const char *file, int tc, int write, int sqthread, > + int fixed, int nonvec) > +{ > + struct io_uring ring; > + int ret, ring_flags = 0; > + > + ring_flags |= IORING_SETUP_SQE128; > + ring_flags |= IORING_SETUP_CQE32; > + ring_flags |= IORING_SETUP_HYBRID_IOPOLL; > + > + if (sqthread) > + ring_flags |= IORING_SETUP_SQPOLL; Doesn't it also need IORING_SETUP_IOPOLL? > + > + ret = t_create_ring(64, &ring, ring_flags); > + if (ret == T_SETUP_SKIP) > + return 0; > + if (ret != T_SETUP_OK) { > + if (ret == -EINVAL) { > + no_pt = 1; > + return T_SETUP_SKIP; > + } > + fprintf(stderr, "ring create failed: %d\n", ret); > + return 1; > + } > + > + ret = __test_io_uring_passthrough_io(file, &ring, tc, write, sqthread, fixed, nonvec); > + io_uring_queue_exit(&ring); > + > + return ret; > +}
On 11/13/24 3:32, Pavel Begunkov wrote: >On 11/11/24 12:36, hexue wrote: >> Add a test file for hybrid iopoll to make sure it works safe.Testcass >> include basic read/write tests, and run in normal iopoll mode and >> passthrough mode respectively. >> >> Signed-off-by: hexue <xue01.he@samsung.com> >If it's not covered already please add tests for failure cases. >E.g. when SETUP_HYBRID_IOPOLL is set without SETUP_IOPOLL I'll thinking about this part. >> + ring_flags |= IORING_SETUP_SQE128; >> + ring_flags |= IORING_SETUP_CQE32; >> + ring_flags |= IORING_SETUP_HYBRID_IOPOLL; >> + >> + if (sqthread) >> + ring_flags |= IORING_SETUP_SQPOLL; >Doesn't it also need IORING_SETUP_IOPOLL? You're right, will fix this. -- hexue
On 11/12/24 3:44 AM, Anuj Gupta wrote: >> +utilization than polling. Similarly, this feature also requires the devices >> +to support polling configuration. > This feature would work if a device doesn't have polled queues,right? > The performance might be suboptimal in that case, but the userspace won't > get any errors. The usersapce will get error in this case. Hybrid must be dependent on iopoll, if hybrid polling is set separately,it will not pass detection and an error will be returned. Therefore, device end cnnfigured are required for polling. ...... > This patch mostly looks fine. But the code here seems to be largely > duplicated from "test/io_uring_passthrough.c" and "test/iopoll.c". > Can we consider adding the hybrid poll test as a part of the existing > tests as it seems that it would only require passing a extra flag > during ring setup. Make sense, I will try to do this in exsiting test file. Thank you very much. -- hexue
On 11/12/24 15:43, Jens Axboe wrote: >On 11/12/24 3:44 AM, Anuj Gupta wrote: >>> +utilization than polling. Similarly, this feature also requires the devices >>> +to support polling configuration. >> This feature would work if a device doesn't have polled queues,right? >> The performance might be suboptimal in that case, but the userspace won't >> get any errors. >We've traditionally been a mix of lax and strict on this. IMHO we should >return -EOPTNOTSUPP for IOPOLL (and IOPOLL|HYBRID) if polling isn't >configured correctly. I've seen way too many not realize that they need >to configure their nvme side for pollable queues for it to do what it >needs to do. If you don't and it's just allowed, then you don't really >get much of a win, you're just burning CPU. > >Hence I do think that this should strongly recommend that the devices >support polling, that part is fine. > >Agree with your other comments, thanks for reviewing it! >> This patch mostly looks fine. But the code here seems to be largely >> duplicated from "test/io_uring_passthrough.c" and "test/iopoll.c". >> Can we consider adding the hybrid poll test as a part of the existing >> tests as it seems that it would only require passing a extra flag >> during ring setup. > >Yeah I do think modifying test/iopoll.c to test all the same >configurations but with HYBRID added would be the way to go, rather than >duplicate all of this. Ditto for passthrough. Got it, will add it and submit v2 patch. -- hexue
diff --git a/man/io_uring_setup.2 b/man/io_uring_setup.2 index 2f87783..8cfafdc 100644 --- a/man/io_uring_setup.2 +++ b/man/io_uring_setup.2 @@ -78,7 +78,15 @@ in question. For NVMe devices, the nvme driver must be loaded with the parameter set to the desired number of polling queues. The polling queues will be shared appropriately between the CPUs in the system, if the number is less than the number of online CPU threads. - +.TP +.B IORING_SETUP_HYBRID_IOPOLL +This flag must setup with +.B IORING_SETUP_IOPOLL +flag. hybrid poll is a new +feature baed on iopoll, this could be a suboptimal solution when running +on a single thread, it offers higher performance than IRQ and lower CPU +utilization than polling. Similarly, this feature also requires the devices +to support polling configuration. .TP .B IORING_SETUP_SQPOLL When this flag is specified, a kernel thread is created to perform diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h index 20bc570..d16364c 100644 --- a/src/include/liburing/io_uring.h +++ b/src/include/liburing/io_uring.h @@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit { */ #define IORING_SETUP_NO_SQARRAY (1U << 16) +/* Use hybrid poll in iopoll process */ +#define IORING_SETUP_HYBRID_IOPOLL (1U << 17) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/test/Makefile b/test/Makefile index dfbbcbe..ea9452c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -116,6 +116,7 @@ test_srcs := \ iopoll.c \ iopoll-leak.c \ iopoll-overflow.c \ + iopoll-hybridpoll.c \ io_uring_enter.c \ io_uring_passthrough.c \ io_uring_register.c \ diff --git a/test/iopoll-hybridpoll.c b/test/iopoll-hybridpoll.c new file mode 100644 index 0000000..d7c08ae --- /dev/null +++ b/test/iopoll-hybridpoll.c @@ -0,0 +1,544 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: basic read/write tests with + * hybrid polled IO, include iopoll and io_uring + * passthrough. + */ +#include <errno.h> +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/types.h> +#include <poll.h> +#include <sys/eventfd.h> +#include <sys/resource.h> +#include "helpers.h" +#include "liburing.h" +#include "../src/syscall.h" +#include "nvme.h" + +#define FILE_SIZE (128 * 1024) +#define BS 4096 +#define BUFFERS (FILE_SIZE / BS) + +static struct iovec *vecs; +static int no_pt, no_iopoll; + +static int fill_pattern(int tc) +{ + unsigned int val, *ptr; + int i, j; + int u_in_buf = BS / sizeof(val); + + val = (tc / 2) * FILE_SIZE; + for (i = 0; i < BUFFERS; i++) { + ptr = vecs[i].iov_base; + for (j = 0; j < u_in_buf; j++) { + *ptr = val; + val++; + ptr++; + } + } + + return 0; +} + +static int verify_buf(int tc, void *buf, off_t off) +{ + int i, u_in_buf = BS / sizeof(unsigned int); + unsigned int *ptr; + + off /= sizeof(unsigned int); + off += (tc / 2) * FILE_SIZE; + ptr = buf; + for (i = 0; i < u_in_buf; i++) { + if (off != *ptr) { + fprintf(stderr, "Found %u, wanted %llu\n", *ptr, + (unsigned long long) off); + return 1; + } + ptr++; + off++; + } + + return 0; +} + +static int __test_io_uring_passthrough_io(const char *file, struct io_uring *ring, int tc, int write, + int sqthread, int fixed, int nonvec) +{ + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + struct nvme_uring_cmd *cmd; + int open_flags; + int do_fixed; + int i, ret, fd = -1; + off_t offset; + __u64 slba; + __u32 nlb; + + if (write) + open_flags = O_WRONLY; + else + open_flags = O_RDONLY; + + if (fixed) { + ret = t_register_buffers(ring, vecs, BUFFERS); + if (ret == T_SETUP_SKIP) + return 0; + if (ret != T_SETUP_OK) { + fprintf(stderr, "buffer reg failed: %d\n", ret); + goto err; + } + } + + fd = open(file, open_flags); + if (fd < 0) { + if (errno == EACCES || errno == EPERM) + return T_EXIT_SKIP; + perror("file open"); + goto err; + } + + if (sqthread) { + ret = io_uring_register_files(ring, &fd, 1); + if (ret) { + fprintf(stderr, "file reg failed: %d\n", ret); + goto err; + } + } + + if (write) + fill_pattern(tc); + + offset = 0; + for (i = 0; i < BUFFERS; i++) { + sqe = io_uring_get_sqe(ring); + if (!sqe) { + fprintf(stderr, "sqe get failed\n"); + goto err; + } + if (write) { + int use_fd = fd; + + do_fixed = fixed; + + if (sqthread) + use_fd = 0; + if (fixed && (i & 1)) + do_fixed = 0; + if (do_fixed) { + io_uring_prep_write_fixed(sqe, use_fd, vecs[i].iov_base, + vecs[i].iov_len, + offset, i); + sqe->cmd_op = NVME_URING_CMD_IO; + } else if (nonvec) { + io_uring_prep_write(sqe, use_fd, vecs[i].iov_base, + vecs[i].iov_len, offset); + sqe->cmd_op = NVME_URING_CMD_IO; + } else { + io_uring_prep_writev(sqe, use_fd, &vecs[i], 1, + offset); + sqe->cmd_op = NVME_URING_CMD_IO_VEC; + } + } else { + int use_fd = fd; + + do_fixed = fixed; + + if (sqthread) + use_fd = 0; + if (fixed && (i & 1)) + do_fixed = 0; + if (do_fixed) { + io_uring_prep_read_fixed(sqe, use_fd, vecs[i].iov_base, + vecs[i].iov_len, + offset, i); + sqe->cmd_op = NVME_URING_CMD_IO; + } else if (nonvec) { + io_uring_prep_read(sqe, use_fd, vecs[i].iov_base, + vecs[i].iov_len, offset); + sqe->cmd_op = NVME_URING_CMD_IO; + } else { + io_uring_prep_readv(sqe, use_fd, &vecs[i], 1, + offset); + sqe->cmd_op = NVME_URING_CMD_IO_VEC; + } + } + sqe->opcode = IORING_OP_URING_CMD; + if (do_fixed) + sqe->uring_cmd_flags |= IORING_URING_CMD_FIXED; + sqe->user_data = ((uint64_t)offset << 32) | i; + if (sqthread) + sqe->flags |= IOSQE_FIXED_FILE; + + cmd = (struct nvme_uring_cmd *)sqe->cmd; + memset(cmd, 0, sizeof(struct nvme_uring_cmd)); + + cmd->opcode = write ? nvme_cmd_write : nvme_cmd_read; + + slba = offset >> lba_shift; + nlb = (BS >> lba_shift) - 1; + + /* cdw10 and cdw11 represent starting lba */ + cmd->cdw10 = slba & 0xffffffff; + cmd->cdw11 = slba >> 32; + /* cdw12 represent number of lba's for read/write */ + cmd->cdw12 = nlb; + if (do_fixed || nonvec) { + cmd->addr = (__u64)(uintptr_t)vecs[i].iov_base; + cmd->data_len = vecs[i].iov_len; + } else { + cmd->addr = (__u64)(uintptr_t)&vecs[i]; + cmd->data_len = 1; + } + cmd->nsid = nsid; + + offset += BS; + } + + ret = io_uring_submit(ring); + if (ret != BUFFERS) { + fprintf(stderr, "submit got %d, wanted %d\n", ret, BUFFERS); + goto err; + } + + for (i = 0; i < BUFFERS; i++) { + ret = io_uring_wait_cqe(ring, &cqe); + if (ret) { + fprintf(stderr, "wait_cqe=%d\n", ret); + goto err; + } + if (cqe->res != 0) { + if (!no_pt) { + no_pt = 1; + goto skip; + } + fprintf(stderr, "cqe res %d, wanted 0\n", cqe->res); + goto err; + } + io_uring_cqe_seen(ring, cqe); + if (!write) { + int index = cqe->user_data & 0xffffffff; + void *buf = vecs[index].iov_base; + off_t voff = cqe->user_data >> 32; + + if (verify_buf(tc, buf, voff)) + goto err; + } + } + + if (fixed) { + ret = io_uring_unregister_buffers(ring); + if (ret) { + fprintf(stderr, "buffer unreg failed: %d\n", ret); + goto err; + } + } + if (sqthread) { + ret = io_uring_unregister_files(ring); + if (ret) { + fprintf(stderr, "file unreg failed: %d\n", ret); + goto err; + } + } + +skip: + close(fd); + return 0; +err: + if (fd != -1) + close(fd); + return 1; +} + +static int test_io_uring_passthrough(const char *file, int tc, int write, int sqthread, + int fixed, int nonvec) +{ + struct io_uring ring; + int ret, ring_flags = 0; + + ring_flags |= IORING_SETUP_SQE128; + ring_flags |= IORING_SETUP_CQE32; + ring_flags |= IORING_SETUP_HYBRID_IOPOLL; + + if (sqthread) + ring_flags |= IORING_SETUP_SQPOLL; + + ret = t_create_ring(64, &ring, ring_flags); + if (ret == T_SETUP_SKIP) + return 0; + if (ret != T_SETUP_OK) { + if (ret == -EINVAL) { + no_pt = 1; + return T_SETUP_SKIP; + } + fprintf(stderr, "ring create failed: %d\n", ret); + return 1; + } + + ret = __test_io_uring_passthrough_io(file, &ring, tc, write, sqthread, fixed, nonvec); + io_uring_queue_exit(&ring); + + return ret; +} + +static int provide_buffers(struct io_uring *ring) +{ + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + int ret, i; + + for (i = 0; i < BUFFERS; i++) { + sqe = io_uring_get_sqe(ring); + io_uring_prep_provide_buffers(sqe, vecs[i].iov_base, + vecs[i].iov_len, 1, 1, i); + } + + ret = io_uring_submit(ring); + if (ret != BUFFERS) { + fprintf(stderr, "submit: %d\n", ret); + return 1; + } + + for (i = 0; i < BUFFERS; i++) { + ret = io_uring_wait_cqe(ring, &cqe); + if (cqe->res < 0) { + fprintf(stderr, "cqe->res=%d\n", cqe->res); + return 1; + } + io_uring_cqe_seen(ring, cqe); + } + + return 0; +} + +static int __test_iopoll_io(const char *file, struct io_uring *ring, int write, int sqthread, + int fixed, int buf_select) +{ + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + int open_flags; + int i, fd = -1, ret; + off_t offset; + + if (buf_select) { + write = 0; + fixed = 0; + } + if (buf_select && provide_buffers(ring)) + return 1; + + if (write) + open_flags = O_WRONLY; + else + open_flags = O_RDONLY; + open_flags |= O_DIRECT; + + if (fixed) { + ret = t_register_buffers(ring, vecs, BUFFERS); + if (ret == T_SETUP_SKIP) + return 0; + if (ret != T_SETUP_OK) { + fprintf(stderr, "buffer reg failed: %d\n", ret); + goto err; + } + } + fd = open(file, open_flags); + if (fd < 0) { + if (errno == EINVAL || errno == EPERM || errno == EACCES) + return 0; + perror("file open"); + goto err; + } + if (sqthread) { + ret = io_uring_register_files(ring, &fd, 1); + if (ret) { + fprintf(stderr, "file reg failed: %d\n", ret); + goto err; + } + } + + offset = 0; + for (i = 0; i < BUFFERS; i++) { + sqe = io_uring_get_sqe(ring); + if (!sqe) { + fprintf(stderr, "sqe get failed\n"); + goto err; + } + offset = BS * (rand() % BUFFERS); + if (write) { + int do_fixed = fixed; + int use_fd = fd; + + if (sqthread) + use_fd = 0; + if (fixed && (i & 1)) + do_fixed = 0; + if (do_fixed) { + io_uring_prep_write_fixed(sqe, use_fd, vecs[i].iov_base, + vecs[i].iov_len, + offset, i); + } else { + io_uring_prep_writev(sqe, use_fd, &vecs[i], 1, + offset); + } + } else { + int do_fixed = fixed; + int use_fd = fd; + + if (sqthread) + use_fd = 0; + if (fixed && (i & 1)) + do_fixed = 0; + if (do_fixed) { + io_uring_prep_read_fixed(sqe, use_fd, vecs[i].iov_base, + vecs[i].iov_len, + offset, i); + } else { + io_uring_prep_readv(sqe, use_fd, &vecs[i], 1, + offset); + } + + } + if (sqthread) + sqe->flags |= IOSQE_FIXED_FILE; + if (buf_select) { + sqe->flags |= IOSQE_BUFFER_SELECT; + sqe->buf_group = buf_select; + sqe->user_data = i; + } + } + + ret = io_uring_submit(ring); + if (ret != BUFFERS) { + ret = io_uring_peek_cqe(ring, &cqe); + if (!ret && cqe->res == -EOPNOTSUPP) { + no_iopoll = 1; + io_uring_cqe_seen(ring, cqe); + goto out; + } + fprintf(stderr, "submit got %d, wanted %d\n", ret, BUFFERS); + goto err; + } + + for (i = 0; i < BUFFERS; i++) { + ret = io_uring_wait_cqe(ring, &cqe); + if (ret) { + fprintf(stderr, "wait_cqe=%d\n", ret); + goto err; + } else if (cqe->res == -EOPNOTSUPP) { + fprintf(stdout, "File/device/fs doesn't support polled IO\n"); + no_iopoll = 1; + goto out; + } else if (cqe->res != BS) { + fprintf(stderr, "cqe res %d, wanted %d\n", cqe->res, BS); + goto err; + } + io_uring_cqe_seen(ring, cqe); + } + + if (fixed) { + ret = io_uring_unregister_buffers(ring); + if (ret) { + fprintf(stderr, "buffer unreg failed: %d\n", ret); + goto err; + } + } + if (sqthread) { + ret = io_uring_unregister_files(ring); + if (ret) { + fprintf(stderr, "file unreg failed: %d\n", ret); + goto err; + } + } + +out: + close(fd); + return 0; +err: + if (fd != -1) + close(fd); + return 1; +} + +static int test_iopoll(const char *fname, int write, int sqthread, int fixed, + int buf_select, int defer) +{ + struct io_uring ring; + int ret, ring_flags = IORING_SETUP_IOPOLL | IORING_SETUP_HYBRID_IOPOLL; + + if (no_iopoll) + return 0; + + if (defer) + ring_flags |= IORING_SETUP_SINGLE_ISSUER | + IORING_SETUP_DEFER_TASKRUN; + + ret = t_create_ring(64, &ring, ring_flags); + if (ret == T_SETUP_SKIP) + return 0; + if (ret != T_SETUP_OK) { + fprintf(stderr, "ring create failed: %d\n", ret); + return 1; + } + ret = __test_iopoll_io(fname, &ring, write, sqthread, fixed, buf_select); + io_uring_queue_exit(&ring); + return ret; +} + +int main(int argc, char *argv[]) +{ + int i, ret; + char buf[256]; + char *fname; + + if (argc > 1) { + fname = argv[1]; + } else { + srand((unsigned)time(NULL)); + snprintf(buf, sizeof(buf), ".basic-rw-%u-%u", + (unsigned)rand(), (unsigned)getpid()); + fname = buf; + t_create_file(fname, FILE_SIZE); + } + + vecs = t_create_buffers(BUFFERS, BS); + + for (i = 0; i < 16; i++) { + int write = (i & 1) != 0; + int sqthread = (i & 2) != 0; + int fixed = (i & 4) != 0; + int buf_select = (i & 8) != 0; + int defer = (i & 16) != 0; + int nonvec = buf_select; + + ret = test_iopoll(fname, write, sqthread, fixed, buf_select, defer); + if (ret) { + fprintf(stderr, "test_iopoll_io failed %d/%d/%d/%d/%d\n", + write, sqthread, fixed, buf_select, defer); + goto err; + } + if (no_iopoll) + break; + + ret = test_io_uring_passthrough(fname, i, write, sqthread, fixed, nonvec); + if (no_pt) + break; + if (ret) { + fprintf(stderr, "test_io_uring_passthrough_io failed %d/%d/%d/%d\n", + write, sqthread, fixed, nonvec); + goto err; + } + } + + if (fname != argv[1]) + unlink(fname); + return ret; +err: + if (fname != argv[1]) + unlink(fname); + return T_EXIT_FAIL; +}
Add a test file for hybrid iopoll to make sure it works safe.Testcass include basic read/write tests, and run in normal iopoll mode and passthrough mode respectively. Signed-off-by: hexue <xue01.he@samsung.com> --- man/io_uring_setup.2 | 10 +- src/include/liburing/io_uring.h | 3 + test/Makefile | 1 + test/iopoll-hybridpoll.c | 544 ++++++++++++++++++++++++++++++++ 4 files changed, 557 insertions(+), 1 deletion(-) create mode 100644 test/iopoll-hybridpoll.c