
[RFC,2/7] block: don't merge different kinds of P2P transfers in a single bio

Message ID 34d44537a65aba6ede215a8ad882aeee028b423a.1730037261.git.leon@kernel.org (mailing list archive)
State Handled Elsewhere
Series Block and NVMe PCI use of new DMA mapping API

Commit Message

Leon Romanovsky Oct. 27, 2024, 2:21 p.m. UTC
From: Christoph Hellwig <hch@lst.de>

To get out of the dma mapping helpers having to check every segment for
its P2P status, ensure that bios contain either only P2P transfers or
only non-P2P transfers, and that a P2P bio only contains ranges from a
single device.

This means we do the page zone access in the bio add path, where the
page should still be cache hot, and only have to do the fairly expensive
P2P topology lookup once per bio down in the dma mapping path, and only
for already marked bios.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 block/bio.c               | 36 +++++++++++++++++++++++++++++-------
 block/blk-map.c           | 32 ++++++++++++++++++++++++--------
 include/linux/blk_types.h |  2 ++
 3 files changed, 55 insertions(+), 15 deletions(-)
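
A consumer-side sketch of the intent (hypothetical code, not part of
this patch; the function and its body are illustrative assumptions
about what a DMA mapping path could do with the new flag):

	/*
	 * With REQ_P2PDMA guaranteed to cover the whole bio, the
	 * expensive P2P topology lookup can happen once per bio
	 * instead of once per segment.
	 */
	static void dma_map_bio_sketch(struct device *dev, struct bio *bio)
	{
		struct bvec_iter iter;
		struct bio_vec bv;
		bool p2p = bio->bi_opf & REQ_P2PDMA;	/* one check per bio */

		if (p2p) {
			/* do the P2P topology lookup once here, e.g.
			 * against the pgmap of the bio's first page */
		}

		bio_for_each_segment(bv, bio, iter) {
			/* map bv for DMA; no per-page
			 * is_pci_p2pdma_page() test is needed */
		}
	}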

Comments

Logan Gunthorpe Oct. 28, 2024, 6:27 p.m. UTC | #1
On 2024-10-27 08:21, Leon Romanovsky wrote:
> From: Christoph Hellwig <hch@lst.de>
> 
> To get out of the dma mapping helpers having to check every segment for
> its P2P status, ensure that bios contain either only P2P transfers or
> only non-P2P transfers, and that a P2P bio only contains ranges from a
> single device.
> 
> This means we do the page zone access in the bio add path, where the
> page should still be cache hot, and only have to do the fairly expensive
> P2P topology lookup once per bio down in the dma mapping path, and only
> for already marked bios.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

Looks good to me.

Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Bart Van Assche Oct. 31, 2024, 8:58 p.m. UTC | #2
On 10/27/24 7:21 AM, Leon Romanovsky wrote:
> +		/*
> +		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
> +		 * bio must be P2P pages from the same device.
> +		 */
> +		if ((bio->bi_opf & REQ_P2PDMA) &&
> +		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
> +			return 0;

It's probably too late to change the "zone_device_" prefix into
something that cannot be confused with a reference to zoned block
devices?

Thanks,

Bart.
Leon Romanovsky Nov. 1, 2024, 6:11 a.m. UTC | #3
On Thu, Oct 31, 2024 at 01:58:37PM -0700, Bart Van Assche wrote:
> On 10/27/24 7:21 AM, Leon Romanovsky wrote:
> > +		/*
> > +		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
> > +		 * bio must be P2P pages from the same device.
> > +		 */
> > +		if ((bio->bi_opf & REQ_P2PDMA) &&
> > +		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
> > +			return 0;
> 
> It's probably too late to change the "zone_device_" prefix into
> something that cannot be confused with a reference to zoned block
> devices?

It is never too late to send a patch that renames things, but it needs
to be worth it, and it is hard to see a global benefit from a
zone_device_* rename. ZONE_DEVICE is a well-known term and is used in
many other places in the kernel.

Thanks

> 
> Thanks,
> 
> Bart.
>
Zhu Yanjun Nov. 2, 2024, 7:39 a.m. UTC | #4
On 2024/10/27 15:21, Leon Romanovsky wrote:
> From: Christoph Hellwig <hch@lst.de>
> 
> To get out of the dma mapping helpers having to check every segment for
> its P2P status, ensure that bios contain either only P2P transfers or
> only non-P2P transfers, and that a P2P bio only contains ranges from a
> single device.
> 
> This means we do the page zone access in the bio add path, where the
> page should still be cache hot, and only have to do the fairly expensive
> P2P topology lookup once per bio down in the dma mapping path, and only
> for already marked bios.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> ---
>   block/bio.c               | 36 +++++++++++++++++++++++++++++-------
>   block/blk-map.c           | 32 ++++++++++++++++++++++++--------
>   include/linux/blk_types.h |  2 ++
>   3 files changed, 55 insertions(+), 15 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 2d3bc8bfb071..943a6d78cb3e 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -928,8 +928,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
>   		return false;
>   	if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
>   		return false;
> -	if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
> -		return false;
>   
>   	*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
>   		     PAGE_MASK));
> @@ -993,6 +991,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
>   	if (bio->bi_vcnt > 0) {
>   		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
>   
> +		/*
> +		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
> +		 * bio must be P2P pages from the same device.
> +		 */
> +		if ((bio->bi_opf & REQ_P2PDMA) &&
> +		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
> +			return 0;
> +
>   		if (bvec_try_merge_hw_page(q, bv, page, len, offset,
>   				same_page)) {
>   			bio->bi_iter.bi_size += len;
> @@ -1009,6 +1015,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
>   		 */
>   		if (bvec_gap_to_prev(&q->limits, bv, offset))
>   			return 0;
> +	} else {
> +		if (is_pci_p2pdma_page(page))
> +			bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
>   	}
>   
>   	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
> @@ -1133,11 +1142,24 @@ static int bio_add_page_int(struct bio *bio, struct page *page,
>   	if (bio->bi_iter.bi_size > UINT_MAX - len)
>   		return 0;
>   
> -	if (bio->bi_vcnt > 0 &&
> -	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
> -				page, len, offset, same_page)) {
> -		bio->bi_iter.bi_size += len;
> -		return len;
> +	if (bio->bi_vcnt > 0) {
> +		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> +
> +		/*
> +		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
> +		 * bio must be P2P pages from the same device.
> +		 */
> +		if ((bio->bi_opf & REQ_P2PDMA) &&
> +		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
> +			return 0;
> +
> +		if (bvec_try_merge_page(bv, page, len, offset, same_page)) {
> +			bio->bi_iter.bi_size += len;
> +			return len;
> +		}
> +	} else {
> +		if (is_pci_p2pdma_page(page))
> +			bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
>   	}
>   
>   	if (bio->bi_vcnt >= bio->bi_max_vecs)
> diff --git a/block/blk-map.c b/block/blk-map.c
> index 0e1167b23934..03192b1ca6ea 100644
> --- a/block/blk-map.c
> +++ b/block/blk-map.c
> @@ -568,6 +568,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
>   	const struct queue_limits *lim = &q->limits;
>   	unsigned int nsegs = 0, bytes = 0;
>   	struct bio *bio;
> +	int error;
>   	size_t i;
>   
>   	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
> @@ -588,15 +589,30 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
>   	for (i = 0; i < nr_segs; i++) {
>   		struct bio_vec *bv = &bvecs[i];
>   
> -		/*
> -		 * If the queue doesn't support SG gaps and adding this
> -		 * offset would create a gap, fallback to copy.
> -		 */
> -		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
> -			blk_mq_map_bio_put(bio);
> -			return -EREMOTEIO;
> +		error = -EREMOTEIO;
> +		if (bvprvp) {
> +			/*
> +			 * If the queue doesn't support SG gaps and adding this
> +			 * offset would create a gap, fallback to copy.
> +			 */
> +			if (bvec_gap_to_prev(lim, bvprvp, bv->bv_offset))
> +				goto put_bio;
> +
> +			/*
> +			 * When doing ZONE_DEVICE-based P2P transfers, all pages
> +			 * in a bio must be P2P pages, and from the same device.
> +			 */
> +			if ((bio->bi_opf & REQ_P2PDMA) &&
> +			    !zone_device_pages_have_same_pgmap(bvprvp->bv_page,
> +					bv->bv_page))
> +				goto put_bio;
> +		} else {
> +			if (is_pci_p2pdma_page(bv->bv_page))
> +				bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
>   		}
> +
>   		/* check full condition */
> +		error = -EINVAL;
>   		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
>   			goto put_bio;
>   		if (bytes + bv->bv_len > nr_iter)
> @@ -611,7 +627,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
>   	return 0;
>   put_bio:
>   	blk_mq_map_bio_put(bio);
> -	return -EINVAL;
> +	return error;
>   }
>   
>   /**
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index dce7615c35e7..94cf146e8ce6 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -378,6 +378,7 @@ enum req_flag_bits {
>   	__REQ_DRV,		/* for driver use */
>   	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
>   	__REQ_ATOMIC,		/* for atomic write operations */
> +	__REQ_P2PDMA,		/* contains P2P DMA pages */
>   	/*
>   	 * Command specific flags, keep last:
>   	 */
> @@ -410,6 +411,7 @@ enum req_flag_bits {
>   #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
>   #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
>   #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
> +#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)

#define REQ_P2PDMA	(__force blk_opf_t)BIT_ULL(__REQ_P2PDMA)

Use BIT_ULL() instead of a direct left shift.

Zhu Yanjun

>   
>   #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
>
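
(For reference, BIT_ULL() expands to essentially the same shift; in
include/linux/bits.h it is defined roughly as

	#define BIT_ULL(nr)	(1ULL << (nr))

so the choice between the two forms is purely one of style.)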
Leon Romanovsky Nov. 3, 2024, 3:19 p.m. UTC | #5
On Sat, Nov 02, 2024 at 08:39:35AM +0100, Zhu Yanjun wrote:
> On 2024/10/27 15:21, Leon Romanovsky wrote:
> > From: Christoph Hellwig <hch@lst.de>
> > 
> > To get out of the dma mapping helpers having to check every segment for
> > its P2P status, ensure that bios contain either only P2P transfers or
> > only non-P2P transfers, and that a P2P bio only contains ranges from a
> > single device.
> > 
> > This means we do the page zone access in the bio add path, where the
> > page should still be cache hot, and only have to do the fairly expensive
> > P2P topology lookup once per bio down in the dma mapping path, and only
> > for already marked bios.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
> > ---
> >   block/bio.c               | 36 +++++++++++++++++++++++++++++-------
> >   block/blk-map.c           | 32 ++++++++++++++++++++++++--------
> >   include/linux/blk_types.h |  2 ++
> >   3 files changed, 55 insertions(+), 15 deletions(-)

<...>

> > @@ -410,6 +411,7 @@ enum req_flag_bits {
> >   #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
> >   #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
> >   #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
> > +#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
> 
> #define REQ_P2PDMA	(__force blk_opf_t)BIT_ULL(__REQ_P2PDMA)
> 
> Use BIT_ULL() instead of a direct left shift.

We keep the coding style consistent, and none of the defines above are
implemented with BIT_ULL().

Thanks

> 
> Zhu Yanjun
> 
> >   #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
>
Christoph Hellwig Nov. 4, 2024, 8:55 a.m. UTC | #6
On Thu, Oct 31, 2024 at 01:58:37PM -0700, Bart Van Assche wrote:
> On 10/27/24 7:21 AM, Leon Romanovsky wrote:
>> +		/*
>> +		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
>> +		 * bio must be P2P pages from the same device.
>> +		 */
>> +		if ((bio->bi_opf & REQ_P2PDMA) &&
>> +		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
>> +			return 0;
>
> It's probably too late to change the "zone_device_" prefix into
> something that cannot be confused with a reference to zoned block
> devices?

It's too late and also wrong and also entirely out of scope for this
series.
Christoph Hellwig Nov. 4, 2024, 8:56 a.m. UTC | #7
On Sat, Nov 02, 2024 at 08:39:35AM +0100, Zhu Yanjun wrote:

<full quote delete>

If you have something useful to say, please trim the mail you are
replying to down to the context required for your reply to make sense.

Ignored for now.

Patch

diff --git a/block/bio.c b/block/bio.c
index 2d3bc8bfb071..943a6d78cb3e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -928,8 +928,6 @@  static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
 		return false;
 	if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
 		return false;
-	if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
-		return false;
 
 	*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
 		     PAGE_MASK));
@@ -993,6 +991,14 @@  int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 	if (bio->bi_vcnt > 0) {
 		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
+		/*
+		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
+		 * bio must be P2P pages from the same device.
+		 */
+		if ((bio->bi_opf & REQ_P2PDMA) &&
+		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
+			return 0;
+
 		if (bvec_try_merge_hw_page(q, bv, page, len, offset,
 				same_page)) {
 			bio->bi_iter.bi_size += len;
@@ -1009,6 +1015,9 @@  int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		 */
 		if (bvec_gap_to_prev(&q->limits, bv, offset))
 			return 0;
+	} else {
+		if (is_pci_p2pdma_page(page))
+			bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
 	}
 
 	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
@@ -1133,11 +1142,24 @@  static int bio_add_page_int(struct bio *bio, struct page *page,
 	if (bio->bi_iter.bi_size > UINT_MAX - len)
 		return 0;
 
-	if (bio->bi_vcnt > 0 &&
-	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
-				page, len, offset, same_page)) {
-		bio->bi_iter.bi_size += len;
-		return len;
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		/*
+		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
+		 * bio must be P2P pages from the same device.
+		 */
+		if ((bio->bi_opf & REQ_P2PDMA) &&
+		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
+			return 0;
+
+		if (bvec_try_merge_page(bv, page, len, offset, same_page)) {
+			bio->bi_iter.bi_size += len;
+			return len;
+		}
+	} else {
+		if (is_pci_p2pdma_page(page))
+			bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
 	}
 
 	if (bio->bi_vcnt >= bio->bi_max_vecs)
diff --git a/block/blk-map.c b/block/blk-map.c
index 0e1167b23934..03192b1ca6ea 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -568,6 +568,7 @@  static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 	const struct queue_limits *lim = &q->limits;
 	unsigned int nsegs = 0, bytes = 0;
 	struct bio *bio;
+	int error;
 	size_t i;
 
 	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
@@ -588,15 +589,30 @@  static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 	for (i = 0; i < nr_segs; i++) {
 		struct bio_vec *bv = &bvecs[i];
 
-		/*
-		 * If the queue doesn't support SG gaps and adding this
-		 * offset would create a gap, fallback to copy.
-		 */
-		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
-			blk_mq_map_bio_put(bio);
-			return -EREMOTEIO;
+		error = -EREMOTEIO;
+		if (bvprvp) {
+			/*
+			 * If the queue doesn't support SG gaps and adding this
+			 * offset would create a gap, fallback to copy.
+			 */
+			if (bvec_gap_to_prev(lim, bvprvp, bv->bv_offset))
+				goto put_bio;
+
+			/*
+			 * When doing ZONE_DEVICE-based P2P transfers, all pages
+			 * in a bio must be P2P pages, and from the same device.
+			 */
+			if ((bio->bi_opf & REQ_P2PDMA) &&
+			    !zone_device_pages_have_same_pgmap(bvprvp->bv_page,
+					bv->bv_page))
+				goto put_bio;
+		} else {
+			if (is_pci_p2pdma_page(bv->bv_page))
+				bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
 		}
+
 		/* check full condition */
+		error = -EINVAL;
 		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
 			goto put_bio;
 		if (bytes + bv->bv_len > nr_iter)
@@ -611,7 +627,7 @@  static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 	return 0;
 put_bio:
 	blk_mq_map_bio_put(bio);
-	return -EINVAL;
+	return error;
 }
 
 /**
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dce7615c35e7..94cf146e8ce6 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -378,6 +378,7 @@  enum req_flag_bits {
 	__REQ_DRV,		/* for driver use */
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
+	__REQ_P2PDMA,		/* contains P2P DMA pages */
 	/*
 	 * Command specific flags, keep last:
 	 */
@@ -410,6 +411,7 @@  enum req_flag_bits {
 #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
+#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
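
A submitter-side usage sketch (hypothetical code; bdev, page_a and
page_b are assumptions, with page_a and page_b standing in for
ZONE_DEVICE P2P pages that belong to two different devices):

	struct bio *bio = bio_alloc(bdev, 2, REQ_OP_WRITE, GFP_KERNEL);

	/* the first P2P page marks the whole bio */
	bio_add_page(bio, page_a, PAGE_SIZE, 0);
	/* bio->bi_opf now carries REQ_P2PDMA | REQ_NOMERGE */

	/*
	 * A P2P page with a different pgmap is refused (bio_add_page()
	 * returns 0), so the caller submits this bio and starts a new
	 * one for page_b.
	 */
	if (bio_add_page(bio, page_b, PAGE_SIZE, 0) == 0)
		submit_bio(bio);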