Message ID | 20190627024910.23987-2-damien.lemoal@wdc.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Fix zone revalidation memory allocation failures | expand |
> +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE That seems like an odd construct, as you don't call flush_kernel_dcache_page. From looking at who defines it, it seems to be about the right set of architectures, but that might be by a mix of chance and similar requirements for cache flushing. > +static void bio_invalidate_vmalloc_pages(struct bio *bio) > +{ > + if (bio->bi_private) { > + struct bvec_iter_all iter_all; > + struct bio_vec *bvec; > + unsigned long len = 0; > + > + bio_for_each_segment_all(bvec, bio, iter_all) > + len += bvec->bv_len; > + invalidate_kernel_vmap_range(bio->bi_private, len); We control the bio here, so we can directly iterate over the segments instead of doing the fairly expensive bio_for_each_segment_all call that goes to each page and builds a bvec for it. > + struct page *page; > int offset, i; > struct bio *bio; > > @@ -1508,6 +1529,12 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, > if (!bio) > return ERR_PTR(-ENOMEM); > > + if (is_vmalloc) { > + flush_kernel_vmap_range(data, len); > + if ((!op_is_write(bio_op(bio)))) > + bio->bi_private = data; > + } We've just allocated the bio, so bio->bi_opf is not actually set at this point unfortunately.
On Thu, Jun 27, 2019 at 11:49:08AM +0900, Damien Le Moal wrote: > To allow the SCSI subsystem scsi_execute_req() function to issue > requests using large buffers that are better allocated with vmalloc() > rather than kmalloc(), modify bio_map_kern() and bio_copy_kern() to > allow passing a buffer allocated with vmalloc(). To do so, detect > vmalloc-ed buffers using is_vmalloc_addr(). For vmalloc-ed buffers, > flush the buffer using flush_kernel_vmap_range(), use vmalloc_to_page() > instead of virt_to_page() to obtain the pages of the buffer, and > invalidate the buffer addresses with invalidate_kernel_vmap_range() on > completion of read BIOs. This last point is executed using the function > bio_invalidate_vmalloc_pages() which is defined only if the > architecture defines ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE, that is, if the > architecture actually needs the invalidation done. > > Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation") > Fixes: e76239a3748c ("block: add a report_zones method") > Cc: stable@vger.kernel.org > Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> > --- > block/bio.c | 43 ++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 42 insertions(+), 1 deletion(-) > > diff --git a/block/bio.c b/block/bio.c > index ce797d73bb43..1c21d1e7f1b8 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -16,6 +16,7 @@ > #include <linux/workqueue.h> > #include <linux/cgroup.h> > #include <linux/blk-cgroup.h> > +#include <linux/highmem.h> > > #include <trace/events/block.h> > #include "blk.h" > @@ -1479,8 +1480,26 @@ void bio_unmap_user(struct bio *bio) > bio_put(bio); > } > > +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE > +static void bio_invalidate_vmalloc_pages(struct bio *bio) > +{ > + if (bio->bi_private) { > + struct bvec_iter_all iter_all; > + struct bio_vec *bvec; > + unsigned long len = 0; > + > + bio_for_each_segment_all(bvec, bio, iter_all) > + len += bvec->bv_len; > + invalidate_kernel_vmap_range(bio->bi_private, 
len); > + } > +} > +#else > +static void bio_invalidate_vmalloc_pages(struct bio *bio) {} > +#endif > + > static void bio_map_kern_endio(struct bio *bio) > { > + bio_invalidate_vmalloc_pages(bio); > bio_put(bio); > } > > @@ -1501,6 +1520,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, > unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; > unsigned long start = kaddr >> PAGE_SHIFT; > const int nr_pages = end - start; > + bool is_vmalloc = is_vmalloc_addr(data); > + struct page *page; > int offset, i; > struct bio *bio; > > @@ -1508,6 +1529,12 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, > if (!bio) > return ERR_PTR(-ENOMEM); > > + if (is_vmalloc) { > + flush_kernel_vmap_range(data, len); > + if ((!op_is_write(bio_op(bio)))) > + bio->bi_private = data; > + } > + > offset = offset_in_page(kaddr); > for (i = 0; i < nr_pages; i++) { > unsigned int bytes = PAGE_SIZE - offset; > @@ -1518,7 +1545,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, > if (bytes > len) > bytes = len; > > - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, > + if (!is_vmalloc) > + page = virt_to_page(data); > + else > + page = vmalloc_to_page(data); > + if (bio_add_pc_page(q, bio, page, bytes, > offset) < bytes) { > /* we don't support partial mappings */ > bio_put(bio); > @@ -1531,6 +1562,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, > } > > bio->bi_end_io = bio_map_kern_endio; > + > return bio; > } > EXPORT_SYMBOL(bio_map_kern); > @@ -1543,6 +1575,7 @@ static void bio_copy_kern_endio(struct bio *bio) > > static void bio_copy_kern_endio_read(struct bio *bio) > { > + unsigned long len = 0; > char *p = bio->bi_private; > struct bio_vec *bvec; > struct bvec_iter_all iter_all; > @@ -1550,8 +1583,12 @@ static void bio_copy_kern_endio_read(struct bio *bio) > bio_for_each_segment_all(bvec, bio, iter_all) { > memcpy(p, 
page_address(bvec->bv_page), bvec->bv_len); > p += bvec->bv_len; > + len += bvec->bv_len; > } > > + if (is_vmalloc_addr(bio->bi_private)) > + invalidate_kernel_vmap_range(bio->bi_private, len); > + > bio_copy_kern_endio(bio); > } > > @@ -1572,6 +1609,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, > unsigned long kaddr = (unsigned long)data; > unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; > unsigned long start = kaddr >> PAGE_SHIFT; > + bool is_vmalloc = is_vmalloc_addr(data); > struct bio *bio; > void *p = data; > int nr_pages = 0; > @@ -1587,6 +1625,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, > if (!bio) > return ERR_PTR(-ENOMEM); > > + if (is_vmalloc) > + flush_kernel_vmap_range(data, len); > + Are your sure that invalidate[|flush]_kernel_vmap_range is needed for bio_copy_kernel? The vmalloc buffer isn't involved in IO, and only accessed by CPU. Thanks, Ming
Christoph, On 2019/06/27 16:28, Christoph Hellwig wrote: >> +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE > > That seems like an odd constructu, as you don't call > flush_kernel_dcache_page. From looking whoe defines it it seems > to be about the right set of architectures, but that might be > by a mix of chance and similar requirements for cache flushing. This comes from include/linux/highmem.h: #ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE static inline void flush_kernel_dcache_page(struct page *page) { } static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif which I guessed is for the architectures that do not need the flush/invalidate vmap functions. I copied. Is there a better way ? The point was to avoid doing the loop on the bvec for the range length on architectures that have an empty definition of invalidate_kernel_vmap_range(). > >> +static void bio_invalidate_vmalloc_pages(struct bio *bio) >> +{ >> + if (bio->bi_private) { >> + struct bvec_iter_all iter_all; >> + struct bio_vec *bvec; >> + unsigned long len = 0; >> + >> + bio_for_each_segment_all(bvec, bio, iter_all) >> + len += bvec->bv_len; >> + invalidate_kernel_vmap_range(bio->bi_private, len); > > We control the bio here, so we can directly iterate over the > segments instead of doing the fairly expensive bio_for_each_segment_all > call that goes to each page and builds a bvec for it. OK. Got it. Will update it. > >> + struct page *page; >> int offset, i; >> struct bio *bio; >> >> @@ -1508,6 +1529,12 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, >> if (!bio) >> return ERR_PTR(-ENOMEM); >> >> + if (is_vmalloc) { >> + flush_kernel_vmap_range(data, len); >> + if ((!op_is_write(bio_op(bio)))) >> + bio->bi_private = data; >> + } > > We've just allocate the bio, so bio->bi_opf is not actually set at > this point unfortunately. OK. I will move the check to the completion path then. 
Thanks !
Hi Ming, On 2019/06/27 16:47, Ming Lei wrote: [...] >> static void bio_copy_kern_endio_read(struct bio *bio) >> { >> + unsigned long len = 0; >> char *p = bio->bi_private; >> struct bio_vec *bvec; >> struct bvec_iter_all iter_all; >> @@ -1550,8 +1583,12 @@ static void bio_copy_kern_endio_read(struct bio *bio) >> bio_for_each_segment_all(bvec, bio, iter_all) { >> memcpy(p, page_address(bvec->bv_page), bvec->bv_len); >> p += bvec->bv_len; >> + len += bvec->bv_len; >> } >> >> + if (is_vmalloc_addr(bio->bi_private)) >> + invalidate_kernel_vmap_range(bio->bi_private, len); >> + >> bio_copy_kern_endio(bio); >> } >> >> @@ -1572,6 +1609,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, >> unsigned long kaddr = (unsigned long)data; >> unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; >> unsigned long start = kaddr >> PAGE_SHIFT; >> + bool is_vmalloc = is_vmalloc_addr(data); >> struct bio *bio; >> void *p = data; >> int nr_pages = 0; >> @@ -1587,6 +1625,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, >> if (!bio) >> return ERR_PTR(-ENOMEM); >> >> + if (is_vmalloc) >> + flush_kernel_vmap_range(data, len); >> + > > Are your sure that invalidate[|flush]_kernel_vmap_range is needed for > bio_copy_kernel? The vmalloc buffer isn't involved in IO, and only > accessed by CPU. Not sure at all. I am a little out of my league here. Christoph mentioned that this is necessary. Christoph, can you confirm ? Thanks !
On Thu, Jun 27, 2019 at 08:17:08AM +0000, Damien Le Moal wrote: > > Are you sure that invalidate[|flush]_kernel_vmap_range is needed for > > bio_copy_kern? The vmalloc buffer isn't involved in IO, and only > > accessed by CPU. > > Not sure at all. I am a little out of my league here. > Christoph mentioned that this is necessary. > > Christoph, can you confirm ? Ming is right. Sorry for misleading you.
On Thu, Jun 27, 2019 at 08:14:56AM +0000, Damien Le Moal wrote: > which I guessed is for the architectures that do not need the flush/invalidate > vmap functions. I copied. Is there a better way ? The point was to avoid doing > the loop on the bvec for the range length on architectures that have an empty > definition of invalidate_kernel_vmap_range(). No, looks like what you did is right. I blame my lack of attention on the heat wave here and the resulting lack of sleep..
On 2019/06/27 17:25, Christoph Hellwig wrote: > On Thu, Jun 27, 2019 at 08:17:08AM +0000, Damien Le Moal wrote: >>> Are your sure that invalidate[|flush]_kernel_vmap_range is needed for >>> bio_copy_kernel? The vmalloc buffer isn't involved in IO, and only >>> accessed by CPU. >> >> Not sure at all. I am a little out of my league here. >> Christoph mentioned that this is necessary. >> >> Christoph, can you confirm ? > > Ming is right. Sorry for misleading you. No problem. Sending a V5 with cleanups. Thanks !
On 2019/06/27 17:26, Christoph Hellwig wrote: > On Thu, Jun 27, 2019 at 08:14:56AM +0000, Damien Le Moal wrote: >> which I guessed is for the architectures that do not need the flush/invalidate >> vmap functions. I copied. Is there a better way ? The point was to avoid doing >> the loop on the bvec for the range length on architectures that have an empty >> definition of invalidate_kernel_vmap_range(). > > No, looks like what you did is right. I blame my lack of attention > on the heat wave here and the resulting lack of sleep.. No problem ! At least we checked twice now !
diff --git a/block/bio.c b/block/bio.c index ce797d73bb43..1c21d1e7f1b8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -16,6 +16,7 @@ #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/blk-cgroup.h> +#include <linux/highmem.h> #include <trace/events/block.h> #include "blk.h" @@ -1479,8 +1480,26 @@ void bio_unmap_user(struct bio *bio) bio_put(bio); } +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ + if (bio->bi_private) { + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + unsigned long len = 0; + + bio_for_each_segment_all(bvec, bio, iter_all) + len += bvec->bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +} +#else +static void bio_invalidate_vmalloc_pages(struct bio *bio) {} +#endif + static void bio_map_kern_endio(struct bio *bio) { + bio_invalidate_vmalloc_pages(bio); bio_put(bio); } @@ -1501,6 +1520,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; int offset, i; struct bio *bio; @@ -1508,6 +1529,12 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (!bio) return ERR_PTR(-ENOMEM); + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + if ((!op_is_write(bio_op(bio)))) + bio->bi_private = data; + } + offset = offset_in_page(kaddr); for (i = 0; i < nr_pages; i++) { unsigned int bytes = PAGE_SIZE - offset; @@ -1518,7 +1545,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (bytes > len) bytes = len; - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ bio_put(bio); 
@@ -1531,6 +1562,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, } bio->bi_end_io = bio_map_kern_endio; + return bio; } EXPORT_SYMBOL(bio_map_kern); @@ -1543,6 +1575,7 @@ static void bio_copy_kern_endio(struct bio *bio) static void bio_copy_kern_endio_read(struct bio *bio) { + unsigned long len = 0; char *p = bio->bi_private; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -1550,8 +1583,12 @@ static void bio_copy_kern_endio_read(struct bio *bio) bio_for_each_segment_all(bvec, bio, iter_all) { memcpy(p, page_address(bvec->bv_page), bvec->bv_len); p += bvec->bv_len; + len += bvec->bv_len; } + if (is_vmalloc_addr(bio->bi_private)) + invalidate_kernel_vmap_range(bio->bi_private, len); + bio_copy_kern_endio(bio); } @@ -1572,6 +1609,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; + bool is_vmalloc = is_vmalloc_addr(data); struct bio *bio; void *p = data; int nr_pages = 0; @@ -1587,6 +1625,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (!bio) return ERR_PTR(-ENOMEM); + if (is_vmalloc) + flush_kernel_vmap_range(data, len); + while (len) { struct page *page; unsigned int bytes = PAGE_SIZE;
To allow the SCSI subsystem scsi_execute_req() function to issue requests using large buffers that are better allocated with vmalloc() rather than kmalloc(), modify bio_map_kern() and bio_copy_kern() to allow passing a buffer allocated with vmalloc(). To do so, detect vmalloc-ed buffers using is_vmalloc_addr(). For vmalloc-ed buffers, flush the buffer using flush_kernel_vmap_range(), use vmalloc_to_page() instead of virt_to_page() to obtain the pages of the buffer, and invalidate the buffer addresses with invalidate_kernel_vmap_range() on completion of read BIOs. This last point is executed using the function bio_invalidate_vmalloc_pages() which is defined only if the architecture defines ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE, that is, if the architecture actually needs the invalidation done. Fixes: 515ce6061312 ("scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation") Fixes: e76239a3748c ("block: add a report_zones method") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> --- block/bio.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-)