Message ID: 20231027181929.2589937-2-kbusch@meta.com (mailing list archive)
State: New
Series: block integrity: directly map user space addresses
> +static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
> +				    int nr_vecs, unsigned int len,
> +				    unsigned int direction, u32 seed)
> +{
> +	struct bio_integrity_payload *bip;
> +	struct bio_vec *copy_vec = NULL;
> +	struct iov_iter iter;
> +	void *buf;
> +	int ret;
> +
> +	/* if bvec is on the stack, we need to allocate a copy for the completion */
> +	if (nr_vecs <= UIO_FASTIOV) {
> +		copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> +		if (!copy_vec)
> +			return -ENOMEM;
> +		memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
> +	}
> +
> +	buf = kmalloc(len, GFP_KERNEL);
> +	if (!buf)
> +		goto free_copy;

ret is not set to -ENOMEM here.

> +
> +	if (direction == ITER_SOURCE) {
> +		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
> +		if (!copy_from_iter_full(buf, len, &iter)) {
> +			ret = -EFAULT;
> +			goto free_buf;
> +		}
> +	} else {
> +		memset(buf, 0, len);
> +	}
> +
> +	/*
> +	 * We just need one vec for this bip, but we need to preserve the
> +	 * number of vecs in the user bvec for the completion handling, so use
> +	 * nr_vecs.
> +	 */
> +	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
> +	if (IS_ERR(bip)) {
> +		ret = PTR_ERR(bip);
> +		goto free_buf;
> +	}
> +
> +	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
> +				     offset_in_page(buf));
> +	if (ret != len) {
> +		ret = -ENOMEM;
> +		goto free_bip;
> +	}
> +
> +	bip->bip_flags |= BIP_INTEGRITY_USER;
> +	bip->copy_vec = copy_vec ?: bvec;
> +	return 0;
> +free_bip:
> +	bio_integrity_free(bio);
> +free_buf:
> +	kfree(buf);
> +free_copy:
> +	kfree(copy_vec);
> +	return ret;
> +}
> +
On Mon, Oct 30, 2023 at 03:40:47PM +0100, Pankaj Raghav wrote:
> > +	int ret;
> > +
> > +	/* if bvec is on the stack, we need to allocate a copy for the completion */
> > +	if (nr_vecs <= UIO_FASTIOV) {
> > +		copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> > +		if (!copy_vec)
> > +			return -ENOMEM;
> > +		memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
> > +	}
> > +
> > +	buf = kmalloc(len, GFP_KERNEL);
> > +	if (!buf)
> > +		goto free_copy;
>
> ret is not set to -ENOMEM here.

Indeed, thanks for pointing that out. I'll wait a bit longer before
posting a v3.
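The fix is to set the error code before taking the jump, presumably what
the v3 respin carries:

	buf = kmalloc(len, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;	/* previously missing: set 'ret' before the goto */
		goto free_copy;
	}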
Hi Keith,

kernel test robot noticed the following build errors:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.6 next-20231030]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-directly-map-user-buffers/20231028-022107
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20231027181929.2589937-2-kbusch%40meta.com
patch subject: [PATCHv2 1/4] block: bio-integrity: directly map user buffers
config: powerpc-allnoconfig (https://download.01.org/0day-ci/archive/20231030/202310302318.pPweFNME-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231030/202310302318.pPweFNME-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310302318.pPweFNME-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from include/linux/blkdev.h:17,
                    from lib/vsprintf.c:47:
   include/linux/bio.h: In function 'bio_integrity_map_user':
>> include/linux/bio.h:799:1: error: expected ';' before '}' token
     799 | }
         | ^
   lib/vsprintf.c: In function 'va_format':
   lib/vsprintf.c:1682:9: warning: function 'va_format' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format]
    1682 |         buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va);
         |         ^~~

vim +799 include/linux/bio.h

   794	
   795	static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
   796						 unsigned int len, u32 seed)
   797	{
   798		return -EINVAL
 > 799	}
   800	
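The error is a missing semicolon in the !CONFIG_BLK_DEV_INTEGRITY stub of
bio_integrity_map_user(); the corrected stub, as the compiler message
itself indicates, would read:

	static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
						 unsigned int len, u32 seed)
	{
		return -EINVAL;	/* v2 lacked this semicolon */
	}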
On 10/27/2023 11:49 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Passthrough commands that utilize metadata currently need to bounce the
> user space buffer through the kernel. Add support for mapping user space
> directly so that we can avoid this costly overhead. This is similar to
> how the normal bio data payload utilizes user addresses with
> bio_map_user_iov().
>
> If the user address can't directly be used for reasons like too many
> segments or address misalignment, fall back to a copy of the user vec
> while keeping the user address pinned for the IO duration so that it
> can safely be copied on completion in any process context.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  block/bio-integrity.c | 195 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/bio.h   |   9 ++
>  2 files changed, 204 insertions(+)
>
> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
> index ec8ac8cf6e1b9..7f9d242ad79df 100644
> --- a/block/bio-integrity.c
> +++ b/block/bio-integrity.c
> @@ -91,6 +91,37 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
>  }
>  EXPORT_SYMBOL(bio_integrity_alloc);
>
> +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
> +{
> +	bool dirty = bio_data_dir(bip->bip_bio) == READ;
> +	struct bio_vec *copy = bip->copy_vec;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +
> +	if (copy) {
> +		unsigned short nr_vecs = bip->bip_max_vcnt;
> +		size_t bytes = bip->bip_iter.bi_size;
> +		void *buf = bvec_virt(bip->bip_vec);
> +
> +		if (dirty) {
> +			struct iov_iter iter;
> +
> +			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
> +			WARN_ON(copy_to_iter(buf, bytes, &iter) != bytes);
> +		}
> +
> +		memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy));
> +		kfree(copy);
> +		kfree(buf);
> +	}
> +
> +	bip_for_each_vec(bv, bip, iter) {
> +		if (dirty && !PageCompound(bv.bv_page))
> +			set_page_dirty_lock(bv.bv_page);
> +		unpin_user_page(bv.bv_page);
> +	}
> +}

Leak here, page-unpinning loop will not execute for the common (i.e.,
no-copy) case...
> +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> +			   u32 seed)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
> +	int ret, direction, nr_vecs, i, j, folios = 0;
> +	struct bio_vec stack_vec[UIO_FASTIOV];
> +	struct bio_vec bv, *bvec = stack_vec;
> +	struct page *stack_pages[UIO_FASTIOV];
> +	struct page **pages = stack_pages;
> +	struct bio_integrity_payload *bip;
> +	struct iov_iter iter;
> +	struct bvec_iter bi;
> +	u32 bytes;
> +
> +	if (bio_integrity(bio))
> +		return -EINVAL;
> +	if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
> +		return -E2BIG;
> +
> +	if (bio_data_dir(bio) == READ)
> +		direction = ITER_DEST;
> +	else
> +		direction = ITER_SOURCE;
> +
> +	iov_iter_ubuf(&iter, direction, ubuf, len);
> +	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
> +	if (nr_vecs > BIO_MAX_VECS)
> +		return -E2BIG;
> +	if (nr_vecs > UIO_FASTIOV) {
> +		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> +		if (!bvec)
> +			return -ENOMEM;
> +		pages = NULL;
> +	}
> +
> +	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
> +	if (unlikely(bytes < 0)) {
> +		ret = bytes;
> +		goto free_bvec;
> +	}
> +
> +	for (i = 0; i < nr_vecs; i = j) {
> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> +		struct folio *folio = page_folio(pages[i]);
> +
> +		bytes -= size;
> +		for (j = i + 1; j < nr_vecs; j++) {
> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +
> +			if (page_folio(pages[j]) != folio ||
> +			    pages[j] != pages[j - 1] + 1)
> +				break;
> +			unpin_user_page(pages[j]);
> +			size += next;
> +			bytes -= next;
> +		}
> +
> +		bvec_set_page(&bvec[folios], pages[i], size, offs);
> +		offs = 0;
> +		folios++;
> +	}
> +
> +	if (pages != stack_pages)
> +		kvfree(pages);
> +
> +	if (folios > queue_max_integrity_segments(q) ||
> +	    !iov_iter_is_aligned(&iter, align, align)) {
> +		ret = bio_integrity_copy_user(bio, bvec, folios, len,
> +					      direction, seed);
> +		if (ret)
> +			goto release_pages;
> +		return 0;
> +	}
> +
> +	bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
> +	if (IS_ERR(bip)) {
> +		ret = PTR_ERR(bip);
> +		goto release_pages;
> +	}
> +
> +	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));

Because with this way of copying, bip->bip_iter.bi_size will remain zero.

Second, is it fine not to have those virt-alignment checks that are done
by bvec_gap_to_prev() when the pages are added using
bio_integrity_add_page()?
On Tue, Oct 31, 2023 at 02:32:48AM +0530, Kanchan Joshi wrote:
> On 10/27/2023 11:49 PM, Keith Busch wrote:
> > +
> > +	bip_for_each_vec(bv, bip, iter) {
> > +		if (dirty && !PageCompound(bv.bv_page))
> > +			set_page_dirty_lock(bv.bv_page);
> > +		unpin_user_page(bv.bv_page);
> > +	}
> > +}
>
> Leak here, page-unpinning loop will not execute for the common (i.e.,
> no-copy) case...
>
> > +	bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
> > +	if (IS_ERR(bip)) {
> > +		ret = PTR_ERR(bip);
> > +		goto release_pages;
> > +	}
> > +
> > +	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
>
> Because with this way of copying, bip->bip_iter.bi_size will remain zero.

Good catch.

> Second, is it fine not to have those virt-alignment checks that are done
> by bvec_gap_to_prev() when the pages are added using
> bio_integrity_add_page()?

We're mapping a single user buffer. It's guaranteed to be virtually
contiguous, so no gaps.
Hi Keith,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.6 next-20231030]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-directly-map-user-buffers/20231028-022107
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20231027181929.2589937-2-kbusch%40meta.com
patch subject: [PATCHv2 1/4] block: bio-integrity: directly map user buffers
config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20231031/202310310704.l4FwoJDd-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231031/202310310704.l4FwoJDd-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310310704.l4FwoJDd-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> block/bio-integrity.c:215:6: warning: variable 'ret' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
           if (!buf)
               ^~~~
   block/bio-integrity.c:255:9: note: uninitialized use occurs here
           return ret;
                  ^~~
   block/bio-integrity.c:215:2: note: remove the 'if' if its condition is always false
           if (!buf)
           ^~~~~~~~~
   block/bio-integrity.c:204:9: note: initialize the variable 'ret' to silence this warning
           int ret;
               ^
                = 0
   1 warning generated.

vim +215 block/bio-integrity.c

   195	
   196	static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
   197					   int nr_vecs, unsigned int len,
   198					   unsigned int direction, u32 seed)
   199	{
   200		struct bio_integrity_payload *bip;
   201		struct bio_vec *copy_vec = NULL;
   202		struct iov_iter iter;
   203		void *buf;
   204		int ret;
   205	
   206		/* if bvec is on the stack, we need to allocate a copy for the completion */
   207		if (nr_vecs <= UIO_FASTIOV) {
   208			copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
   209			if (!copy_vec)
   210				return -ENOMEM;
   211			memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
   212		}
   213	
   214		buf = kmalloc(len, GFP_KERNEL);
 > 215		if (!buf)
   216			goto free_copy;
   217	
   218		if (direction == ITER_SOURCE) {
   219			iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
   220			if (!copy_from_iter_full(buf, len, &iter)) {
   221				ret = -EFAULT;
   222				goto free_buf;
   223			}
   224		} else {
   225			memset(buf, 0, len);
   226		}
   227	
   228		/*
   229		 * We just need one vec for this bip, but we need to preserve the
   230		 * number of vecs in the user bvec for the completion handling, so use
   231		 * nr_vecs.
   232		 */
   233		bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
   234		if (IS_ERR(bip)) {
   235			ret = PTR_ERR(bip);
   236			goto free_buf;
   237		}
   238	
   239		ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
   240					     offset_in_page(buf));
   241		if (ret != len) {
   242			ret = -ENOMEM;
   243			goto free_bip;
   244		}
   245	
   246		bip->bip_flags |= BIP_INTEGRITY_USER;
   247		bip->copy_vec = copy_vec ?: bvec;
   248		return 0;
   249	free_bip:
   250		bio_integrity_free(bio);
   251	free_buf:
   252		kfree(buf);
   253	free_copy:
   254		kfree(copy_vec);
   255		return ret;
   256	}
   257	
Hi Keith,

kernel test robot noticed the following build errors:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.6 next-20231030]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-directly-map-user-buffers/20231028-022107
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20231027181929.2589937-2-kbusch%40meta.com
patch subject: [PATCHv2 1/4] block: bio-integrity: directly map user buffers
config: mips-allmodconfig (https://download.01.org/0day-ci/archive/20231031/202310311041.38ISTxlo-lkp@intel.com/config)
compiler: mips-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231031/202310311041.38ISTxlo-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310311041.38ISTxlo-lkp@intel.com/

All errors (new ones prefixed by >>):

   block/bio-integrity.c: In function 'bio_integrity_map_user':
>> block/bio-integrity.c:294:72: error: passing argument 6 of 'iov_iter_extract_pages' from incompatible pointer type [-Werror=incompatible-pointer-types]
     294 |         bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
         |                                                                        ^~~~~
         |                                                                        |
         |                                                                        long unsigned int *
   In file included from include/linux/bio.h:11,
                    from include/linux/blkdev.h:17,
                    from include/linux/blk-mq.h:5,
                    from include/linux/blk-integrity.h:5,
                    from block/bio-integrity.c:9:
   include/linux/uio.h:400:40: note: expected 'size_t *' {aka 'unsigned int *'} but argument is of type 'long unsigned int *'
     400 |                                        size_t *offset0);
         |                                        ~~~~~~~~^~~~~~~
   cc1: some warnings being treated as errors

vim +/iov_iter_extract_pages +294 block/bio-integrity.c

   257	
   258	int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
   259				   u32 seed)
   260	{
   261		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
   262		unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
   263		int ret, direction, nr_vecs, i, j, folios = 0;
   264		struct bio_vec stack_vec[UIO_FASTIOV];
   265		struct bio_vec bv, *bvec = stack_vec;
   266		struct page *stack_pages[UIO_FASTIOV];
   267		struct page **pages = stack_pages;
   268		struct bio_integrity_payload *bip;
   269		struct iov_iter iter;
   270		struct bvec_iter bi;
   271		u32 bytes;
   272	
   273		if (bio_integrity(bio))
   274			return -EINVAL;
   275		if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
   276			return -E2BIG;
   277	
   278		if (bio_data_dir(bio) == READ)
   279			direction = ITER_DEST;
   280		else
   281			direction = ITER_SOURCE;
   282	
   283		iov_iter_ubuf(&iter, direction, ubuf, len);
   284		nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
   285		if (nr_vecs > BIO_MAX_VECS)
   286			return -E2BIG;
   287		if (nr_vecs > UIO_FASTIOV) {
   288			bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
   289			if (!bvec)
   290				return -ENOMEM;
   291			pages = NULL;
   292		}
   293	
 > 294		bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
   295		if (unlikely(bytes < 0)) {
   296			ret = bytes;
   297			goto free_bvec;
   298		}
   299	
   300		for (i = 0; i < nr_vecs; i = j) {
   301			size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
   302			struct folio *folio = page_folio(pages[i]);
   303	
   304			bytes -= size;
   305			for (j = i + 1; j < nr_vecs; j++) {
   306				size_t next = min_t(size_t, PAGE_SIZE, bytes);
   307	
   308				if (page_folio(pages[j]) != folio ||
   309				    pages[j] != pages[j - 1] + 1)
   310					break;
   311				unpin_user_page(pages[j]);
   312				size += next;
   313				bytes -= next;
   314			}
   315	
   316			bvec_set_page(&bvec[folios], pages[i], size, offs);
   317			offs = 0;
   318			folios++;
   319		}
   320	
   321		if (pages != stack_pages)
   322			kvfree(pages);
   323	
   324		if (folios > queue_max_integrity_segments(q) ||
   325		    !iov_iter_is_aligned(&iter, align, align)) {
   326			ret = bio_integrity_copy_user(bio, bvec, folios, len,
   327						      direction, seed);
   328			if (ret)
   329				goto release_pages;
   330			return 0;
   331		}
   332	
   333		bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
   334		if (IS_ERR(bip)) {
   335			ret = PTR_ERR(bip);
   336			goto release_pages;
   337		}
   338	
   339		memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
   340		if (bvec != stack_vec)
   341			kfree(bvec);
   342	
   343		bip->bip_flags |= BIP_INTEGRITY_USER;
   344		bip->copy_vec = NULL;
   345		return 0;
   346	
   347	release_pages:
   348		bi.bi_size = len;
   349		for_each_bvec(bv, bvec, bi, bi)
   350			unpin_user_page(bv.bv_page);
   351	free_bvec:
   352		if (bvec != stack_vec)
   353			kfree(bvec);
   354		return ret;
   355	}
   356	EXPORT_SYMBOL_GPL(bio_integrity_map_user);
   357	
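The reported mismatch comes from passing &offs, declared unsigned long,
where iov_iter_extract_pages() expects a size_t * (which is 'unsigned
int *' on 32-bit mips). A sketch of the obvious fix is to split the
declaration:

	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
	size_t offs;	/* matches iov_iter_extract_pages()'s 'size_t *offset0' */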
On 10/27/2023 11:49 PM, Keith Busch wrote:
> +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> +			   u32 seed)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
> +	int ret, direction, nr_vecs, i, j, folios = 0;
> +	struct bio_vec stack_vec[UIO_FASTIOV];
> +	struct bio_vec bv, *bvec = stack_vec;
> +	struct page *stack_pages[UIO_FASTIOV];
> +	struct page **pages = stack_pages;
> +	struct bio_integrity_payload *bip;
> +	struct iov_iter iter;
> +	struct bvec_iter bi;
> +	u32 bytes;
> +
> +	if (bio_integrity(bio))
> +		return -EINVAL;
> +	if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
> +		return -E2BIG;
> +
> +	if (bio_data_dir(bio) == READ)
> +		direction = ITER_DEST;
> +	else
> +		direction = ITER_SOURCE;
> +
> +	iov_iter_ubuf(&iter, direction, ubuf, len);
> +	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
> +	if (nr_vecs > BIO_MAX_VECS)
> +		return -E2BIG;
> +	if (nr_vecs > UIO_FASTIOV) {
> +		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> +		if (!bvec)
> +			return -ENOMEM;
> +		pages = NULL;
> +	}
> +
> +	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
> +	if (unlikely(bytes < 0)) {
> +		ret = bytes;
> +		goto free_bvec;
> +	}
> +
> +	for (i = 0; i < nr_vecs; i = j) {
> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> +		struct folio *folio = page_folio(pages[i]);
> +
> +		bytes -= size;
> +		for (j = i + 1; j < nr_vecs; j++) {
> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +
> +			if (page_folio(pages[j]) != folio ||
> +			    pages[j] != pages[j - 1] + 1)
> +				break;
> +			unpin_user_page(pages[j]);

Is this unpin correct here?
On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
> On 10/27/2023 11:49 PM, Keith Busch wrote:
> > +	for (i = 0; i < nr_vecs; i = j) {
> > +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> > +		struct folio *folio = page_folio(pages[i]);
> > +
> > +		bytes -= size;
> > +		for (j = i + 1; j < nr_vecs; j++) {
> > +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> > +
> > +			if (page_folio(pages[j]) != folio ||
> > +			    pages[j] != pages[j - 1] + 1)
> > +				break;
> > +			unpin_user_page(pages[j]);
>
> Is this unpin correct here?

Should be. The pages are bound to the folio, so this doesn't really
unpin the user page. It just drops a reference, and the folio holds the
final reference to the contiguous pages, which is released on
completion.

You can find the same idea in io_uring/rsrc.c, io_sqe_buffer_register().
On 11/6/2023 8:32 PM, Keith Busch wrote:
> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
>> On 10/27/2023 11:49 PM, Keith Busch wrote:
>>> +	for (i = 0; i < nr_vecs; i = j) {
>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
>>> +		struct folio *folio = page_folio(pages[i]);
>>> +
>>> +		bytes -= size;
>>> +		for (j = i + 1; j < nr_vecs; j++) {
>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>> +
>>> +			if (page_folio(pages[j]) != folio ||
>>> +			    pages[j] != pages[j - 1] + 1)
>>> +				break;
>>> +			unpin_user_page(pages[j]);
>>
>> Is this unpin correct here?
>
> Should be. The pages are bound to the folio, so this doesn't really
> unpin the user page. It just drops a reference, and the folio holds the
> final reference to the contiguous pages, which is released on
> completion.

But the completion is still going to see multiple pages and not one
(folio). The bip_for_each_vec loop is going to drop the reference again.
I suspect it is not folio-aware.
On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
> On 11/6/2023 8:32 PM, Keith Busch wrote:
> > On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
> >> On 10/27/2023 11:49 PM, Keith Busch wrote:
> >>> +	for (i = 0; i < nr_vecs; i = j) {
> >>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> >>> +		struct folio *folio = page_folio(pages[i]);
> >>> +
> >>> +		bytes -= size;
> >>> +		for (j = i + 1; j < nr_vecs; j++) {
> >>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> >>> +
> >>> +			if (page_folio(pages[j]) != folio ||
> >>> +			    pages[j] != pages[j - 1] + 1)
> >>> +				break;
> >>> +			unpin_user_page(pages[j]);
> >>
> >> Is this unpin correct here?
> >
> > Should be. The pages are bound to the folio, so this doesn't really
> > unpin the user page. It just drops a reference, and the folio holds the
> > final reference to the contiguous pages, which is released on
> > completion.
>
> But the completion is still going to see multiple pages and not one
> (folio). The bip_for_each_vec loop is going to drop the reference again.
> I suspect it is not folio-aware.

The completion unpins once per bvec, not individual pages. The setup
creates multipage bvecs with only one pin remaining per bvec for all of
the bvec's pages. If a page can't be merged into the current bvec, then
that page is not unpinned and becomes the first page of the next bvec.
On 11/7/2023 8:38 PM, Keith Busch wrote:
> On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
>> On 11/6/2023 8:32 PM, Keith Busch wrote:
>>> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
>>>> On 10/27/2023 11:49 PM, Keith Busch wrote:
>>>>> +	for (i = 0; i < nr_vecs; i = j) {
>>>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
>>>>> +		struct folio *folio = page_folio(pages[i]);
>>>>> +
>>>>> +		bytes -= size;
>>>>> +		for (j = i + 1; j < nr_vecs; j++) {
>>>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>>>> +
>>>>> +			if (page_folio(pages[j]) != folio ||
>>>>> +			    pages[j] != pages[j - 1] + 1)
>>>>> +				break;
>>>>> +			unpin_user_page(pages[j]);
>>>>
>>>> Is this unpin correct here?
>>>
>>> Should be. The pages are bound to the folio, so this doesn't really
>>> unpin the user page. It just drops a reference, and the folio holds the
>>> final reference to the contiguous pages, which is released on
>>> completion.
>>
>> But the completion is still going to see multiple pages and not one
>> (folio). The bip_for_each_vec loop is going to drop the reference again.
>> I suspect it is not folio-aware.
>
> The completion unpins once per bvec, not individual pages. The setup
> creates multipage bvecs with only one pin remaining per bvec for all of
> the bvec's pages. If a page can't be merged into the current bvec, then
> that page is not unpinned and becomes the first page of the next bvec.

Here is a test program [2] that creates this scenario.
Single 8KB+16b read on a 4KB+8b formatted namespace. It prepares the
meta-buffer out of a huge page in a way that it spans two regular 4K
pages. With this, I see more unpins than expected.

And I had added this [1] also on top of your patch.

[1]
@@ -339,7 +367,22 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
 	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
 	if (bvec != stack_vec)
 		kfree(bvec);
+	// quick fix for completion
+	bip->bip_vcnt = folios;
+	bip->bip_iter.bi_size = len;

[2]
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <liburing.h>
#include <libnvme.h>

#define DEV		"/dev/ng0n1"
#define NSID		1
#define DBCNT		2
#define DATA_BUFLEN	(4096 * DBCNT)
#define OFFSET		0
#define LBA_SHIFT	12

/* This assumes 4K + 8b lba format */
#define MD_BUFLEN	(8 * DBCNT)
#define MD_OFFSET	(4096 - 8)
#define HP_SIZE		(2*2*1024*1024) /* Two 2M pages */

#define APPTAG_MASK	(0xFFFF)
#define APPTAG		(0x8888)

void *alloc_meta_buf_hp()
{
	void *ptr;

	ptr = mmap(NULL, HP_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
		   -1, 0);
	if (ptr == MAP_FAILED)
		return NULL;
	return ptr;
}

void free_meta_buf(void *ptr)
{
	munmap(ptr, HP_SIZE);
}

int main()
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring_params p = { };
	int fd, ret;
	struct nvme_uring_cmd *cmd;
	void *buffer, *md_buf;
	__u64 slba;
	__u16 nlb;

	ret = posix_memalign(&buffer, DATA_BUFLEN, DATA_BUFLEN);
	if (ret) {
		fprintf(stderr, "data buffer allocation failed: %d\n", ret);
		return 1;
	}
	memset(buffer, 'x', DATA_BUFLEN);

	md_buf = alloc_meta_buf_hp();
	if (!md_buf) {
		fprintf(stderr, "meta buffer allocation failed: %d\n", ret);
		return 1;
	}

	p.flags = IORING_SETUP_CQE32 | IORING_SETUP_SQE128;
	ret = io_uring_queue_init_params(4, &ring, &p);
	if (ret) {
		fprintf(stderr, "ring create failed: %d\n", ret);
		return 1;
	}

	fd = open(DEV, O_RDWR);
	if (fd < 0) {
		perror("file open");
		exit(1);
	}

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buffer, DATA_BUFLEN, OFFSET);
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->user_data = 1234;

	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(struct nvme_uring_cmd));
	cmd->opcode = nvme_cmd_read;
	cmd->addr = (__u64)(uintptr_t)buffer;
	cmd->data_len = DATA_BUFLEN;
	cmd->nsid = NSID;

	slba = OFFSET >> LBA_SHIFT;
	nlb = (DATA_BUFLEN >> LBA_SHIFT) - 1;
	cmd->cdw10 = slba & 0xffffffff;
	cmd->cdw11 = slba >> 32;
	cmd->cdw12 = nlb;
	/* set the pract and prchk (Guard, App, RefTag) bits in cdw12 */
	//cmd->cdw12 |= 15 << 26;
	cmd->cdw12 |= 7 << 26;

	cmd->metadata = ((__u64)(uintptr_t)md_buf) + MD_OFFSET;
	cmd->metadata_len = MD_BUFLEN;

	/* reftag */
	cmd->cdw14 = (__u32)slba;
	/* apptag mask and apptag */
	cmd->cdw15 = APPTAG_MASK << 16 | APPTAG;

	ret = io_uring_submit(&ring);
	if (ret != 1) {
		fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
		goto err;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret) {
		fprintf(stderr, "wait_cqe=%d\n", ret);
		goto err;
	}
	if (cqe->res != 0) {
		fprintf(stderr, "cqe res %d, wanted success\n", cqe->res);
		goto err;
	}
	io_uring_cqe_seen(&ring, cqe);

	free_meta_buf(md_buf);
	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
err:
	if (fd != -1)
		close(fd);
	io_uring_queue_exit(&ring);
	return 1;
}
On Wed, Nov 08, 2023 at 05:45:19PM +0530, Kanchan Joshi wrote:
> On 11/7/2023 8:38 PM, Keith Busch wrote:
> > On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
> >> On 11/6/2023 8:32 PM, Keith Busch wrote:
> >>> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
> >>>> On 10/27/2023 11:49 PM, Keith Busch wrote:
> >>>>> +	for (i = 0; i < nr_vecs; i = j) {
> >>>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> >>>>> +		struct folio *folio = page_folio(pages[i]);
> >>>>> +
> >>>>> +		bytes -= size;
> >>>>> +		for (j = i + 1; j < nr_vecs; j++) {
> >>>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> >>>>> +
> >>>>> +			if (page_folio(pages[j]) != folio ||
> >>>>> +			    pages[j] != pages[j - 1] + 1)
> >>>>> +				break;
> >>>>> +			unpin_user_page(pages[j]);
> >>>>
> >>>> Is this unpin correct here?
> >>>
> >>> Should be. The pages are bound to the folio, so this doesn't really
> >>> unpin the user page. It just drops a reference, and the folio holds the
> >>> final reference to the contiguous pages, which is released on
> >>> completion.
> >>
> >> But the completion is still going to see multiple pages and not one
> >> (folio). The bip_for_each_vec loop is going to drop the reference again.
> >> I suspect it is not folio-aware.
> >
> > The completion unpins once per bvec, not individual pages. The setup
> > creates multipage bvecs with only one pin remaining per bvec for all of
> > the bvec's pages. If a page can't be merged into the current bvec, then
> > that page is not unpinned and becomes the first page of the next bvec.
>
> Here is a test program [2] that creates this scenario.
> Single 8KB+16b read on a 4KB+8b formatted namespace. It prepares the
> meta-buffer out of a huge page in a way that it spans two regular 4K
> pages. With this, I see more unpins than expected.

I understand now. The bip_for_each_vec loop is using single-page vector
iterators. Will fix it up for next time.
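One plausible shape for that fix (a sketch only; the helper name is
invented here and this code is not taken from the series) is to walk the
stored bvec array directly, so each multipage bvec is unpinned exactly
once:

	/*
	 * Folio-aware replacement for the single-page bip_for_each_vec loop:
	 * setup left exactly one pin per merged (possibly multipage) bvec,
	 * so drop exactly one pin per bvec here.
	 */
	static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
					     bool dirty)
	{
		int i;

		for (i = 0; i < nr_vecs; i++) {
			if (dirty && !PageCompound(bv[i].bv_page))
				set_page_dirty_lock(bv[i].bv_page);
			unpin_user_page(bv[i].bv_page);
		}
	}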
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b9..7f9d242ad79df 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -91,6 +91,37 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+	bool dirty = bio_data_dir(bip->bip_bio) == READ;
+	struct bio_vec *copy = bip->copy_vec;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+
+	if (copy) {
+		unsigned short nr_vecs = bip->bip_max_vcnt;
+		size_t bytes = bip->bip_iter.bi_size;
+		void *buf = bvec_virt(bip->bip_vec);
+
+		if (dirty) {
+			struct iov_iter iter;
+
+			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+			WARN_ON(copy_to_iter(buf, bytes, &iter) != bytes);
+		}
+
+		memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy));
+		kfree(copy);
+		kfree(buf);
+	}
+
+	bip_for_each_vec(bv, bip, iter) {
+		if (dirty && !PageCompound(bv.bv_page))
+			set_page_dirty_lock(bv.bv_page);
+		unpin_user_page(bv.bv_page);
+	}
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +136,8 @@ void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else if (bip->bip_flags & BIP_INTEGRITY_USER)
+		bio_integrity_unmap_user(bip);
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -160,6 +193,168 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+				   int nr_vecs, unsigned int len,
+				   unsigned int direction, u32 seed)
+{
+	struct bio_integrity_payload *bip;
+	struct bio_vec *copy_vec = NULL;
+	struct iov_iter iter;
+	void *buf;
+	int ret;
+
+	/* if bvec is on the stack, we need to allocate a copy for the completion */
+	if (nr_vecs <= UIO_FASTIOV) {
+		copy_vec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
+		if (!copy_vec)
+			return -ENOMEM;
+		memcpy(copy_vec, bvec, nr_vecs * sizeof(*bvec));
+	}
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		goto free_copy;
+
+	if (direction == ITER_SOURCE) {
+		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+		if (!copy_from_iter_full(buf, len, &iter)) {
+			ret = -EFAULT;
+			goto free_buf;
+		}
+	} else {
+		memset(buf, 0, len);
+	}
+
+	/*
+	 * We just need one vec for this bip, but we need to preserve the
+	 * number of vecs in the user bvec for the completion handling, so use
+	 * nr_vecs.
+	 */
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto free_buf;
+	}
+
+	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+				     offset_in_page(buf));
+	if (ret != len) {
+		ret = -ENOMEM;
+		goto free_bip;
+	}
+
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	bip->copy_vec = copy_vec ?: bvec;
+	return 0;
+free_bip:
+	bio_integrity_free(bio);
+free_buf:
+	kfree(buf);
+free_copy:
+	kfree(copy_vec);
+	return ret;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
+			   u32 seed)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
+	int ret, direction, nr_vecs, i, j, folios = 0;
+	struct bio_vec stack_vec[UIO_FASTIOV];
+	struct bio_vec bv, *bvec = stack_vec;
+	struct page *stack_pages[UIO_FASTIOV];
+	struct page **pages = stack_pages;
+	struct bio_integrity_payload *bip;
+	struct iov_iter iter;
+	struct bvec_iter bi;
+	u32 bytes;
+
+	if (bio_integrity(bio))
+		return -EINVAL;
+	if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+		return -E2BIG;
+
+	if (bio_data_dir(bio) == READ)
+		direction = ITER_DEST;
+	else
+		direction = ITER_SOURCE;
+
+	iov_iter_ubuf(&iter, direction, ubuf, len);
+	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+	if (nr_vecs > BIO_MAX_VECS)
+		return -E2BIG;
+	if (nr_vecs > UIO_FASTIOV) {
+		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
+		if (!bvec)
+			return -ENOMEM;
+		pages = NULL;
+	}
+
+	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
+	if (unlikely(bytes < 0)) {
+		ret = bytes;
+		goto free_bvec;
+	}
+
+	for (i = 0; i < nr_vecs; i = j) {
+		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
+		struct folio *folio = page_folio(pages[i]);
+
+		bytes -= size;
+		for (j = i + 1; j < nr_vecs; j++) {
+			size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+			if (page_folio(pages[j]) != folio ||
+			    pages[j] != pages[j - 1] + 1)
+				break;
+			unpin_user_page(pages[j]);
+			size += next;
+			bytes -= next;
+		}
+
+		bvec_set_page(&bvec[folios], pages[i], size, offs);
+		offs = 0;
+		folios++;
+	}
+
+	if (pages != stack_pages)
+		kvfree(pages);
+
+	if (folios > queue_max_integrity_segments(q) ||
+	    !iov_iter_is_aligned(&iter, align, align)) {
+		ret = bio_integrity_copy_user(bio, bvec, folios, len,
+					      direction, seed);
+		if (ret)
+			goto release_pages;
+		return 0;
+	}
+
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto release_pages;
+	}
+
+	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
+	if (bvec != stack_vec)
+		kfree(bvec);
+
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	bip->copy_vec = NULL;
+	return 0;
+
+release_pages:
+	bi.bi_size = len;
+	for_each_bvec(bv, bvec, bi, bi)
+		unpin_user_page(bv.bv_page);
+free_bvec:
+	if (bvec != stack_vec)
+		kfree(bvec);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 41d417ee13499..2b4a0de838ed1 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,7 @@ enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_INTEGRITY_USER	= 1 << 5, /* Integrity payload is user address */
 };
 
 /*
@@ -342,6 +343,7 @@ struct bio_integrity_payload {
 
 	struct work_struct	bip_work;	/* I/O completion */
 
+	struct bio_vec		*copy_vec;	/* for bounce buffering */
 	struct bio_vec		*bip_vec;
 	struct bio_vec		bip_inline_vecs[];/* embedded bvec array */
 };
@@ -720,6 +722,7 @@ static inline bool bioset_initialized(struct bio_set *bs)
 
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
+extern int bio_integrity_map_user(struct bio *, void __user *, unsigned int, u32);
 extern bool bio_integrity_prep(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
 extern void bio_integrity_trim(struct bio *);
@@ -789,6 +792,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
+					 unsigned int len, u32 seed)
+{
+	return -EINVAL
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 /*
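For context, a minimal sketch of how a passthrough path might call the
new helper; the wrapper and its names are illustrative assumptions, not
part of the patch:

	/*
	 * Attach user-space integrity metadata to a bio whose data payload is
	 * already mapped. On completion, bio_integrity_free() copies back to
	 * user space if needed and unpins the pages.
	 */
	static int attach_user_metadata(struct bio *bio, void __user *meta_ubuf,
					unsigned int meta_len, u32 seed)
	{
		int ret;

		ret = bio_integrity_map_user(bio, meta_ubuf, meta_len, seed);
		if (ret)	/* -EINVAL, -E2BIG, -ENOMEM or -EFAULT */
			return ret;
		return 0;
	}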