diff mbox series

[10/14] blk-mq: initial support for multiple queue maps

Message ID 20181029163738.10172-11-axboe@kernel.dk (mailing list archive)
State Superseded
Headers show
Series blk-mq: Add support for multiple queue maps | expand

Commit Message

Jens Axboe Oct. 29, 2018, 4:37 p.m. UTC
Add a queue offset to the tag map. This enables users to map
iteratively, for each queue map type they support.

Bump maximum number of supported maps to 2, we're now fully
able to support more than 1 map.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-cpumap.c  | 9 +++++----
 block/blk-mq-pci.c     | 2 +-
 block/blk-mq-virtio.c  | 2 +-
 include/linux/blk-mq.h | 3 ++-
 4 files changed, 9 insertions(+), 7 deletions(-)

Comments

Bart Van Assche Oct. 29, 2018, 7:40 p.m. UTC | #1
On Mon, 2018-10-29 at 10:37 -0600, Jens Axboe wrote:
> -static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
> +static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
> +			      unsigned int nr_queues, const int cpu)
>  {
> -	return cpu % nr_queues;
> +	return qmap->queue_offset + (cpu % nr_queues);
>  }
> 
> [ ... ]
>  
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -78,10 +78,11 @@ struct blk_mq_hw_ctx {
>  struct blk_mq_queue_map {
>  	unsigned int *mq_map;
>  	unsigned int nr_queues;
> +	unsigned int queue_offset;
>  };

I think it's unfortunate that the blk-mq core uses the .queue_offset member but
that mapping functions in block drivers are responsible for setting that member.
Since the block driver mapping functions have to set blk_mq_queue_map.nr_queues,
how about adding a loop in blk_mq_update_queue_map() that derives .queue_offset
from .nr_queues from previous array entries?

Thanks,

Bart.
Jens Axboe Oct. 29, 2018, 7:53 p.m. UTC | #2
On 10/29/18 1:40 PM, Bart Van Assche wrote:
> On Mon, 2018-10-29 at 10:37 -0600, Jens Axboe wrote:
>> -static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
>> +static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
>> +			      unsigned int nr_queues, const int cpu)
>>  {
>> -	return cpu % nr_queues;
>> +	return qmap->queue_offset + (cpu % nr_queues);
>>  }
>>
>> [ ... ]
>>  
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -78,10 +78,11 @@ struct blk_mq_hw_ctx {
>>  struct blk_mq_queue_map {
>>  	unsigned int *mq_map;
>>  	unsigned int nr_queues;
>> +	unsigned int queue_offset;
>>  };
> 
> I think it's unfortunate that the blk-mq core uses the .queue_offset member but
> that mapping functions in block drivers are responsible for setting that member.
> Since the block driver mapping functions have to set blk_mq_queue_map.nr_queues,
> how about adding a loop in blk_mq_update_queue_map() that derives .queue_offset
> from .nr_queues from previous array entries?

It's not a simple increment, so the driver has to be the one setting it. If
we end up sharing queues, for instance, then the driver will need to set
it to the start offset of that set. If you go two patches forward you
can see that exact construct.

IOW, it's the driver that controls the offset, not the core.
Bart Van Assche Oct. 29, 2018, 8 p.m. UTC | #3
On Mon, 2018-10-29 at 13:53 -0600, Jens Axboe wrote:
> On 10/29/18 1:40 PM, Bart Van Assche wrote:
> > On Mon, 2018-10-29 at 10:37 -0600, Jens Axboe wrote:
> > > -static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
> > > +static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
> > > +			      unsigned int nr_queues, const int cpu)
> > >  {
> > > -	return cpu % nr_queues;
> > > +	return qmap->queue_offset + (cpu % nr_queues);
> > >  }
> > > 
> > > [ ... ]
> > >  
> > > --- a/include/linux/blk-mq.h
> > > +++ b/include/linux/blk-mq.h
> > > @@ -78,10 +78,11 @@ struct blk_mq_hw_ctx {
> > >  struct blk_mq_queue_map {
> > >  	unsigned int *mq_map;
> > >  	unsigned int nr_queues;
> > > +	unsigned int queue_offset;
> > >  };
> > 
> > I think it's unfortunate that the blk-mq core uses the .queue_offset member but
> > that mapping functions in block drivers are responsible for setting that member.
> > Since the block driver mapping functions have to set blk_mq_queue_map.nr_queues,
> > how about adding a loop in blk_mq_update_queue_map() that derives .queue_offset
> > from .nr_queues from previous array entries?
> 
> It's not a simple increment, so the driver has to be the one setting it. If
> we end up sharing queues, for instance, then the driver will need to set
> it to the start offset of that set. If you go two patches forward you
> can see that exact construct.
> 
> IOW, it's the driver that controls the offset, not the core.

If sharing of hardware queues between hardware queue types is supported,
what should hctx->type be set to? Additionally, patch 5 adds code that uses
hctx->type as an array index. How can that code work if a single hardware
queue can be shared by multiple hardware queue types?

Thanks,

Bart.
Jens Axboe Oct. 29, 2018, 8:09 p.m. UTC | #4
On 10/29/18 2:00 PM, Bart Van Assche wrote:
> On Mon, 2018-10-29 at 13:53 -0600, Jens Axboe wrote:
>> On 10/29/18 1:40 PM, Bart Van Assche wrote:
>>> On Mon, 2018-10-29 at 10:37 -0600, Jens Axboe wrote:
>>>> -static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
>>>> +static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
>>>> +			      unsigned int nr_queues, const int cpu)
>>>>  {
>>>> -	return cpu % nr_queues;
>>>> +	return qmap->queue_offset + (cpu % nr_queues);
>>>>  }
>>>>
>>>> [ ... ]
>>>>  
>>>> --- a/include/linux/blk-mq.h
>>>> +++ b/include/linux/blk-mq.h
>>>> @@ -78,10 +78,11 @@ struct blk_mq_hw_ctx {
>>>>  struct blk_mq_queue_map {
>>>>  	unsigned int *mq_map;
>>>>  	unsigned int nr_queues;
>>>> +	unsigned int queue_offset;
>>>>  };
>>>
>>> I think it's unfortunate that the blk-mq core uses the .queue_offset member but
>>> that mapping functions in block drivers are responsible for setting that member.
>>> Since the block driver mapping functions have to set blk_mq_queue_map.nr_queues,
>>> how about adding a loop in blk_mq_update_queue_map() that derives .queue_offset
>>> from .nr_queues from previous array entries?
>>
>> It's not a simple increment, so the driver has to be the one setting it. If
>> we end up sharing queues, for instance, then the driver will need to set
>> it to the start offset of that set. If you go two patches forward you
>> can see that exact construct.
>>
>> IOW, it's the driver that controls the offset, not the core.
> 
> If sharing of hardware queues between hardware queue types is supported,
> what should hctx->type be set to? Additionally, patch 5 adds code that uses
> hctx->type as an array index. How can that code work if a single hardware
> queue can be shared by multiple hardware queue types?

hctx->type will be set to the value of the first type. This is all driver
private, blk-mq could not care less what the value of the type means.

As to the other question, it works just fine since that is the queue
that is being accessed. There's no confusion there. I think you're
misunderstanding how it's set up. To use nvme as the example, type 0
would be reads, 1 writes, and 2 pollable queues. If reads and writes
share the same set of hardware queues, then type 1 simply doesn't
exist in terms of ->flags_to_type() return value. This is purely
driven by the driver. That hook is the only decider of where something
will go. If we share hctx sets, we share the same hardware queue as
well. There is just the one set for that case.
Bart Van Assche Oct. 29, 2018, 8:25 p.m. UTC | #5
On Mon, 2018-10-29 at 14:09 -0600, Jens Axboe wrote:
> hctx->type will be set to the value of the first type. This is all driver
> private, blk-mq could not care less what the value of the type means.
> 
> As to the other question, it works just fine since that is the queue
> that is being accessed. There's no confusion there. I think you're
> misunderstanding how it's set up. To use nvme as the example, type 0
> would be reads, 1 writes, and 2 pollable queues. If reads and writes
> share the same set of hardware queues, then type 1 simply doesn't
> exist in terms of ->flags_to_type() return value. This is purely
> driven by the driver. That hook is the only decider of where something
> will go. If we share hctx sets, we share the same hardware queue as
> well. There is just the one set for that case.

How about adding a comment in blk-mq.h that explains that hardware queues can
be shared among different hardware queue types? I think this is nontrivial and
deserves a comment.

Thanks,

Bart.
Jens Axboe Oct. 29, 2018, 8:29 p.m. UTC | #6
On 10/29/18 2:25 PM, Bart Van Assche wrote:
> On Mon, 2018-10-29 at 14:09 -0600, Jens Axboe wrote:
>> hctx->type will be set to the value of the first type. This is all driver
>> private, blk-mq could not care less what the value of the type means.
>>
>> As to the other question, it works just fine since that is the queue
>> that is being accessed. There's no confusion there. I think you're
>> misunderstanding how it's set up. To use nvme as the example, type 0
>> would be reads, 1 writes, and 2 pollable queues. If reads and writes
>> share the same set of hardware queues, then type 1 simply doesn't
>> exist in terms of ->flags_to_type() return value. This is purely
>> driven by the driver. That hook is the only decider of where something
>> will go. If we share hctx sets, we share the same hardware queue as
>> well. There is just the one set for that case.
> 
> How about adding a comment in blk-mq.h that explains that hardware queues can
> be shared among different hardware queue types? I think this is nontrivial and
> deserves a comment.

Sure, I can do that. I guess a key concept that is confusing based on
your above question is that the sets don't have to be consecutive.
It's perfectly valid to have 0 and 2 be the available queues, and
nothing for 1. For example.

BTW, I split up the incremental patch; find the pieces here:

http://git.kernel.dk/cgit/linux-block/commit/?h=mq-maps&id=6890d88deecfd3723ce620d82f5fc80485f9caec

and

http://git.kernel.dk/cgit/linux-block/commit/?h=mq-maps&id=907725dff2f8cc6d1502a9123f930b8d3708bd02
diff mbox series

Patch

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 6e6686c55984..03a534820271 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,9 +14,10 @@ 
 #include "blk.h"
 #include "blk-mq.h"
 
-static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
+static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
+			      unsigned int nr_queues, const int cpu)
 {
-	return cpu % nr_queues;
+	return qmap->queue_offset + (cpu % nr_queues);
 }
 
 static int get_first_sibling(unsigned int cpu)
@@ -44,11 +45,11 @@  int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 		 * performace optimizations.
 		 */
 		if (cpu < nr_queues) {
-			map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+			map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
 		} else {
 			first_sibling = get_first_sibling(cpu);
 			if (first_sibling == cpu)
-				map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+				map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
 			else
 				map[cpu] = map[first_sibling];
 		}
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index 40333d60a850..1dce18553984 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -43,7 +43,7 @@  int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			qmap->mq_map[cpu] = queue;
+			qmap->mq_map[cpu] = qmap->queue_offset + queue;
 	}
 
 	return 0;
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index 661fbfef480f..370827163835 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -44,7 +44,7 @@  int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			qmap->mq_map[cpu] = queue;
+			qmap->mq_map[cpu] = qmap->queue_offset + queue;
 	}
 
 	return 0;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 837087cf07cc..b5ae2b5677c1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -78,10 +78,11 @@  struct blk_mq_hw_ctx {
 struct blk_mq_queue_map {
 	unsigned int *mq_map;
 	unsigned int nr_queues;
+	unsigned int queue_offset;
 };
 
 enum {
-	HCTX_MAX_TYPES = 1,
+	HCTX_MAX_TYPES = 2,
 };
 
 struct blk_mq_tag_set {