Message ID | 34d44537a65aba6ede215a8ad882aeee028b423a.1730037261.git.leon@kernel.org (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Series | Block and NMMe PCI use of new DMA mapping API | expand |
On 2024-10-27 08:21, Leon Romanovsky wrote: > From: Christoph Hellwig <hch@lst.de> > > To get out of the dma mapping helpers having to check every segment for > it's P2P status, ensure that bios either contain P2P transfers or non-P2P > transfers, and that a P2P bio only contains ranges from a single device. > > This means we do the page zone access in the bio add path where it should > be still page hot, and will only have do the fairly expensive P2P topology > lookup once per bio down in the dma mapping path, and only for already > marked bios. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Looks good to me. Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
On 10/27/24 7:21 AM, Leon Romanovsky wrote: > + /* > + * When doing ZONE_DEVICE-based P2P transfers, all pages in a > + * bio must be P2P pages from the same device. > + */ > + if ((bio->bi_opf & REQ_P2PDMA) && > + !zone_device_pages_have_same_pgmap(bv->bv_page, page)) > + return 0; It's probably too late to change the "zone_device_" prefix into something that cannot be confused with a reference to zoned block devices? Thanks, Bart.
On Thu, Oct 31, 2024 at 01:58:37PM -0700, Bart Van Assche wrote: > On 10/27/24 7:21 AM, Leon Romanovsky wrote: > > + /* > > + * When doing ZONE_DEVICE-based P2P transfers, all pages in a > > + * bio must be P2P pages from the same device. > > + */ > > + if ((bio->bi_opf & REQ_P2PDMA) && > > + !zone_device_pages_have_same_pgmap(bv->bv_page, page)) > > + return 0; > > It's probably too late to change the "zone_device_" prefix into > something that cannot be confused with a reference to zoned block > devices? It is never too late to send a patch which renames the names, but it needs to be worth it. However it is hard to see global benefit from zone_device_* rename. ZONE_DEVICE is a well known term in the kernel and it is used all other places in the kernel. Thanks > > Thanks, > > Bart. >
在 2024/10/27 15:21, Leon Romanovsky 写道: > From: Christoph Hellwig <hch@lst.de> > > To get out of the dma mapping helpers having to check every segment for > it's P2P status, ensure that bios either contain P2P transfers or non-P2P > transfers, and that a P2P bio only contains ranges from a single device. > > This means we do the page zone access in the bio add path where it should > be still page hot, and will only have do the fairly expensive P2P topology > lookup once per bio down in the dma mapping path, and only for already > marked bios. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > --- > block/bio.c | 36 +++++++++++++++++++++++++++++------- > block/blk-map.c | 32 ++++++++++++++++++++++++-------- > include/linux/blk_types.h | 2 ++ > 3 files changed, 55 insertions(+), 15 deletions(-) > > diff --git a/block/bio.c b/block/bio.c > index 2d3bc8bfb071..943a6d78cb3e 100644 > --- a/block/bio.c > +++ b/block/bio.c > @@ -928,8 +928,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, > return false; > if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) > return false; > - if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) > - return false; > > *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) & > PAGE_MASK)); > @@ -993,6 +991,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, > if (bio->bi_vcnt > 0) { > struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; > > + /* > + * When doing ZONE_DEVICE-based P2P transfers, all pages in a > + * bio must be P2P pages from the same device. > + */ > + if ((bio->bi_opf & REQ_P2PDMA) && > + !zone_device_pages_have_same_pgmap(bv->bv_page, page)) > + return 0; > + > if (bvec_try_merge_hw_page(q, bv, page, len, offset, > same_page)) { > bio->bi_iter.bi_size += len; > @@ -1009,6 +1015,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, > */ > if (bvec_gap_to_prev(&q->limits, bv, offset)) > return 0; > + } else { > + if (is_pci_p2pdma_page(page)) > + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; > } > > bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); > @@ -1133,11 +1142,24 @@ static int bio_add_page_int(struct bio *bio, struct page *page, > if (bio->bi_iter.bi_size > UINT_MAX - len) > return 0; > > - if (bio->bi_vcnt > 0 && > - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], > - page, len, offset, same_page)) { > - bio->bi_iter.bi_size += len; > - return len; > + if (bio->bi_vcnt > 0) { > + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; > + > + /* > + * When doing ZONE_DEVICE-based P2P transfers, all pages in a > + * bio must be P2P pages from the same device. > + */ > + if ((bio->bi_opf & REQ_P2PDMA) && > + !zone_device_pages_have_same_pgmap(bv->bv_page, page)) > + return 0; > + > + if (bvec_try_merge_page(bv, page, len, offset, same_page)) { > + bio->bi_iter.bi_size += len; > + return len; > + } > + } else { > + if (is_pci_p2pdma_page(page)) > + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; > } > > if (bio->bi_vcnt >= bio->bi_max_vecs) > diff --git a/block/blk-map.c b/block/blk-map.c > index 0e1167b23934..03192b1ca6ea 100644 > --- a/block/blk-map.c > +++ b/block/blk-map.c > @@ -568,6 +568,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) > const struct queue_limits *lim = &q->limits; > unsigned int nsegs = 0, bytes = 0; > struct bio *bio; > + int error; > size_t i; > > if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q)) > @@ -588,15 +589,30 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) > for (i = 0; i < nr_segs; i++) { > struct bio_vec *bv = &bvecs[i]; > > - /* > - * If the queue doesn't support SG gaps and adding this > - * offset would create a gap, fallback to copy. > - */ > - if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) { > - blk_mq_map_bio_put(bio); > - return -EREMOTEIO; > + error = -EREMOTEIO; > + if (bvprvp) { > + /* > + * If the queue doesn't support SG gaps and adding this > + * offset would create a gap, fallback to copy. > + */ > + if (bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) > + goto put_bio; > + > + /* > + * When doing ZONE_DEVICE-based P2P transfers, all pages > + * in a bio must be P2P pages, and from the same device. > + */ > + if ((bio->bi_opf & REQ_P2PDMA) && > + zone_device_pages_have_same_pgmap(bvprvp->bv_page, > + bv->bv_page)) > + goto put_bio; > + } else { > + if (is_pci_p2pdma_page(bv->bv_page)) > + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; > } > + > /* check full condition */ > + error = -EINVAL; > if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len) > goto put_bio; > if (bytes + bv->bv_len > nr_iter) > @@ -611,7 +627,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) > return 0; > put_bio: > blk_mq_map_bio_put(bio); > - return -EINVAL; > + return error; > } > > /** > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index dce7615c35e7..94cf146e8ce6 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -378,6 +378,7 @@ enum req_flag_bits { > __REQ_DRV, /* for driver use */ > __REQ_FS_PRIVATE, /* for file system (submitter) use */ > __REQ_ATOMIC, /* for atomic write operations */ > + __REQ_P2PDMA, /* contains P2P DMA pages */ > /* > * Command specific flags, keep last: > */ > @@ -410,6 +411,7 @@ enum req_flag_bits { > #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) > #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) > #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) > +#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_P2PDMA (__force blk_opf_t)BIT_ULL(__REQ_P2PDMA) Use BIT_ULL instead of direct left shit. Zhu Yanjun > > #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) >
On Sat, Nov 02, 2024 at 08:39:35AM +0100, Zhu Yanjun wrote: > 在 2024/10/27 15:21, Leon Romanovsky 写道: > > From: Christoph Hellwig <hch@lst.de> > > > > To get out of the dma mapping helpers having to check every segment for > > it's P2P status, ensure that bios either contain P2P transfers or non-P2P > > transfers, and that a P2P bio only contains ranges from a single device. > > > > This means we do the page zone access in the bio add path where it should > > be still page hot, and will only have do the fairly expensive P2P topology > > lookup once per bio down in the dma mapping path, and only for already > > marked bios. > > > > Signed-off-by: Christoph Hellwig <hch@lst.de> > > Signed-off-by: Leon Romanovsky <leonro@nvidia.com> > > --- > > block/bio.c | 36 +++++++++++++++++++++++++++++------- > > block/blk-map.c | 32 ++++++++++++++++++++++++-------- > > include/linux/blk_types.h | 2 ++ > > 3 files changed, 55 insertions(+), 15 deletions(-) <...> > > @@ -410,6 +411,7 @@ enum req_flag_bits { > > #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) > > #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) > > #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) > > +#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) > > #define REQ_P2PDMA (__force blk_opf_t)BIT_ULL(__REQ_P2PDMA) > > Use BIT_ULL instead of direct left shit. We keep coding style consistent and all defines above aren't implemented with BIT_ULL(). Thanks > > Zhu Yanjun > > > #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) >
On Thu, Oct 31, 2024 at 01:58:37PM -0700, Bart Van Assche wrote: > On 10/27/24 7:21 AM, Leon Romanovsky wrote: >> + /* >> + * When doing ZONE_DEVICE-based P2P transfers, all pages in a >> + * bio must be P2P pages from the same device. >> + */ >> + if ((bio->bi_opf & REQ_P2PDMA) && >> + !zone_device_pages_have_same_pgmap(bv->bv_page, page)) >> + return 0; > > It's probably too late to change the "zone_device_" prefix into > something that cannot be confused with a reference to zoned block > devices? It's too late and also wrong and also entirely out of scope for this series.
On Sat, Nov 02, 2024 at 08:39:35AM +0100, Zhu Yanjun wrote <full quote delete> If you have something useful to say, please quote the mail you are replying to to the context required to make sense. Ignored for now.
diff --git a/block/bio.c b/block/bio.c index 2d3bc8bfb071..943a6d78cb3e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -928,8 +928,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; - if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) - return false; *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) & PAGE_MASK)); @@ -993,6 +991,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (bio->bi_vcnt > 0) { struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + /* + * When doing ZONE_DEVICE-based P2P transfers, all pages in a + * bio must be P2P pages from the same device. + */ + if ((bio->bi_opf & REQ_P2PDMA) && + !zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + if (bvec_try_merge_hw_page(q, bv, page, len, offset, same_page)) { bio->bi_iter.bi_size += len; @@ -1009,6 +1015,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, */ if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; + } else { + if (is_pci_p2pdma_page(page)) + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; } bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); @@ -1133,11 +1142,24 @@ static int bio_add_page_int(struct bio *bio, struct page *page, if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; - if (bio->bi_vcnt > 0 && - bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], - page, len, offset, same_page)) { - bio->bi_iter.bi_size += len; - return len; + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + /* + * When doing ZONE_DEVICE-based P2P transfers, all pages in a + * bio must be P2P pages from the same device. + */ + if ((bio->bi_opf & REQ_P2PDMA) && + !zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return 0; + + if (bvec_try_merge_page(bv, page, len, offset, same_page)) { + bio->bi_iter.bi_size += len; + return len; + } + } else { + if (is_pci_p2pdma_page(page)) + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; } if (bio->bi_vcnt >= bio->bi_max_vecs) diff --git a/block/blk-map.c b/block/blk-map.c index 0e1167b23934..03192b1ca6ea 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -568,6 +568,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) const struct queue_limits *lim = &q->limits; unsigned int nsegs = 0, bytes = 0; struct bio *bio; + int error; size_t i; if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q)) @@ -588,15 +589,30 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) for (i = 0; i < nr_segs; i++) { struct bio_vec *bv = &bvecs[i]; - /* - * If the queue doesn't support SG gaps and adding this - * offset would create a gap, fallback to copy. - */ - if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) { - blk_mq_map_bio_put(bio); - return -EREMOTEIO; + error = -EREMOTEIO; + if (bvprvp) { + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, fallback to copy. + */ + if (bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) + goto put_bio; + + /* + * When doing ZONE_DEVICE-based P2P transfers, all pages + * in a bio must be P2P pages, and from the same device. + */ + if ((bio->bi_opf & REQ_P2PDMA) && + zone_device_pages_have_same_pgmap(bvprvp->bv_page, + bv->bv_page)) + goto put_bio; + } else { + if (is_pci_p2pdma_page(bv->bv_page)) + bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; } + /* check full condition */ + error = -EINVAL; if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len) goto put_bio; if (bytes + bv->bv_len > nr_iter) @@ -611,7 +627,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) return 0; put_bio: blk_mq_map_bio_put(bio); - return -EINVAL; + return error; } /** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dce7615c35e7..94cf146e8ce6 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -378,6 +378,7 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ + __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -410,6 +411,7 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) +#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP)