@@ -55,7 +55,8 @@
| UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
| UBLK_F_UNPRIVILEGED_DEV \
- | UBLK_F_CMD_IOCTL_ENCODE)
+ | UBLK_F_CMD_IOCTL_ENCODE \
+ | UBLK_F_USER_COPY)
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
@@ -311,9 +312,18 @@ static int ublk_apply_params(struct ublk_device *ub)
return 0;
}
+static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_USER_COPY;
+}
+
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
- return false;
+ /*
+ * read()/write() is involved in user copy, so request reference
+ * has to be grabbed
+ */
+ return ublk_support_user_copy(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
@@ -590,6 +600,9 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ublk_support_user_copy(ubq))
+ return rq_bytes;
+
/*
* no zero copy, we delay copy WRITE request data into ublksrv
* context and the big benefit is that pinning pages in current
@@ -614,6 +627,9 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ublk_support_user_copy(ubq))
+ return rq_bytes;
+
if (ublk_need_unmap_req(req)) {
struct iov_iter iter;
struct iovec iov;
@@ -1397,6 +1413,11 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
+ if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = ublk_check_cmd_op(cmd_op);
if (ret)
goto out;
@@ -1415,23 +1436,34 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
*/
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
goto out;
- /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */
- if (!ub_cmd->addr && !ublk_need_get_data(ubq))
- goto out;
+
+ if (!ublk_support_user_copy(ubq)) {
+ /*
+ * FETCH_RQ has to provide IO buffer if NEED GET
+ * DATA is not enabled
+ */
+ if (!ub_cmd->addr && !ublk_need_get_data(ubq))
+ goto out;
+ }
ublk_fill_io(io, cmd, ub_cmd->addr);
ublk_mark_io_ready(ub, ubq);
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is
- * not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ))
- goto out;
+
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
+
+ if (!ublk_support_user_copy(ubq)) {
+ /*
+ * COMMIT_AND_FETCH_REQ has to provide IO buffer if
+ * NEED GET DATA is not enabled or it is Read IO.
+ */
+ if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
+ req_op(req) == REQ_OP_READ))
+ goto out;
+ }
ublk_fill_io(io, cmd, ub_cmd->addr);
ublk_commit_completion(ub, ub_cmd);
break;
@@ -1944,6 +1976,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
UBLK_F_URING_CMD_COMP_IN_TASK;
+ if (ub->dev_info.flags & UBLK_F_USER_COPY)
+ ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
+
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
@@ -165,6 +165,9 @@
/* use ioctl encoding for uring command */
#define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6)
+/* Copy between request and user buffer by pread()/pwrite() */
+#define UBLK_F_USER_COPY (1UL << 7)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
Currently copy between io request buffer(pages) and userspace buffer is done inside ublk_map_io() or ublk_unmap_io(). This way performs very well in case of pre-allocated userspace io buffer. For dynamically allocated or external userspace backend io buffer, UBLK_F_NEED_GET_DATA is added for ublk server to provide buffer by one extra command communication for WRITE request. For READ, userspace simply provides buffer, but can't know when the buffer is done[1]. Add UBLK_F_USER_COPY by moving io data copy out of kernel by providing read()/write() on /dev/ublkcN, and simply let ublk server do the io data copy. This way makes both side cleaner, the cost is that one extra syscall for copy io data between request and backend buffer. With UBLK_F_USER_COPY, it actually becomes possible to run per-io zero copy now, such as, only do zero copy for big size IO, so it can be thought as one prep patch for supporting zero copy. Meantime zero copy still needs to expose read()/write() buffer for some corner case, such as passthrough IO. [1] READ buffer in UBLK_F_NEED_GET_DATA https://lore.kernel.org/linux-block/116d8a56-0881-56d3-9bcc-78ff3e1dc4e5@linux.alibaba.com/T/#m23bd4b8634c0a054e6797063167b469949a247bb Signed-off-by: Ming Lei <ming.lei@redhat.com> --- drivers/block/ublk_drv.c | 57 ++++++++++++++++++++++++++++------- include/uapi/linux/ublk_cmd.h | 3 ++ 2 files changed, 49 insertions(+), 11 deletions(-)