
brd: Allow ramdisk to be allocated on selected NUMA node

Message ID 20180614133832.110947-1-hare@suse.de (mailing list archive)
State New, archived

Commit Message

Hannes Reinecke June 14, 2018, 1:38 p.m. UTC
For performance reasons we should be able to allocate all memory
from a given NUMA node, so this patch adds a new parameter
'rd_numa_node' to allow the user to specify the NUMA node id.
When restricting fio to use the same NUMA node I'm seeing a performance
boost of more than 200%.

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 drivers/block/brd.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)
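
As a usage sketch (not part of the patch itself): the new rd_numa_node parameter
is set at module load time and the benchmark is then bound to the same node,
e.g. via numactl. The node number, device size and fio job options below are
illustrative assumptions, not taken from the thread.

    # Allocate the ramdisk pages on node 1 (assumed) and run fio from that node
    modprobe brd rd_nr=1 rd_size=4194304 rd_numa_node=1
    numactl -N 1 -m 1 fio --name=ramtest --filename=/dev/ram0 \
        --rw=randrw --bs=4k --direct=1 --numjobs=4 \
        --time_based --runtime=30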

Comments

Jens Axboe June 14, 2018, 2:47 p.m. UTC | #1
On 6/14/18 7:38 AM, Hannes Reinecke wrote:
> For performance reasons we should be able to allocate all memory
> from a given NUMA node, so this patch adds a new parameter
> 'rd_numa_node' to allow the user to specify the NUMA node id.
> When restricing fio to use the same NUMA node I'm seeing a performance
> boost of more than 200%.

Looks fine to me. One comment.

> @@ -342,6 +343,10 @@ static int max_part = 1;
>  module_param(max_part, int, 0444);
>  MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
>  
> +static int rd_numa_node = NUMA_NO_NODE;
> +module_param(rd_numa_node, int, 0444);
> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM disk on.");

This could feasibly be 0644, as there would be nothing wrong with altering
this at runtime.
Hannes Reinecke June 14, 2018, 3:29 p.m. UTC | #2
On Thu, 14 Jun 2018 08:47:33 -0600
Jens Axboe <axboe@kernel.dk> wrote:

> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
> > For performance reasons we should be able to allocate all memory
> > from a given NUMA node, so this patch adds a new parameter
> > 'rd_numa_node' to allow the user to specify the NUMA node id.
> > When restricing fio to use the same NUMA node I'm seeing a
> > performance boost of more than 200%.  
> 
> Looks fine to me. One comment.
> 
> > @@ -342,6 +343,10 @@ static int max_part = 1;
> >  module_param(max_part, int, 0444);
> >  MODULE_PARM_DESC(max_part, "Num Minors to reserve between
> > devices"); 
> > +static int rd_numa_node = NUMA_NO_NODE;
> > +module_param(rd_numa_node, int, 0444);
> > +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
> > disk on.");  
> 
> This could feasibly be 0644, as there would be nothing wrong with
> altering this at runtime.
> 

While we could, it would not change the allocation of _existing_ ram
devices, making the behaviour rather unpredictable.
Hence I decided against it (and yes, I actually thought about it).

But if you insist ...

Cheers,

Hannes
Jens Axboe June 14, 2018, 3:33 p.m. UTC | #3
On 6/14/18 9:29 AM, Hannes Reinecke wrote:
> On Thu, 14 Jun 2018 08:47:33 -0600
> Jens Axboe <axboe@kernel.dk> wrote:
> 
>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>> For performance reasons we should be able to allocate all memory
>>> from a given NUMA node, so this patch adds a new parameter
>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>> When restricing fio to use the same NUMA node I'm seeing a
>>> performance boost of more than 200%.  
>>
>> Looks fine to me. One comment.
>>
>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>  module_param(max_part, int, 0444);
>>>  MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>> devices"); 
>>> +static int rd_numa_node = NUMA_NO_NODE;
>>> +module_param(rd_numa_node, int, 0444);
>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>> disk on.");  
>>
>> This could feasibly be 0644, as there would be nothing wrong with
>> altering this at runtime.
>>
> 
> While we could it would not change the allocation of _existing_ ram
> devices, making behaviour rather unpredictable.
> Hence I did decide against it (and yes, I actually thought about it).
> 
> But if you insist ...

Right, it would just change new allocations. Probably not a common use
case, but there's really nothing that prevents it from being feasible.

Next question - what does the memory allocator do if we run out of
memory on the given node? Should we punt to a different node if that
happens? Slower, but functional, seems preferable to not being able
to get memory.
Hannes Reinecke June 14, 2018, 4:09 p.m. UTC | #4
On Thu, 14 Jun 2018 09:33:35 -0600
Jens Axboe <axboe@kernel.dk> wrote:

> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
> > On Thu, 14 Jun 2018 08:47:33 -0600
> > Jens Axboe <axboe@kernel.dk> wrote:
> >   
> >> On 6/14/18 7:38 AM, Hannes Reinecke wrote:  
> >>> For performance reasons we should be able to allocate all memory
> >>> from a given NUMA node, so this patch adds a new parameter
> >>> 'rd_numa_node' to allow the user to specify the NUMA node id.
> >>> When restricing fio to use the same NUMA node I'm seeing a
> >>> performance boost of more than 200%.    
> >>
> >> Looks fine to me. One comment.
> >>  
> >>> @@ -342,6 +343,10 @@ static int max_part = 1;
> >>>  module_param(max_part, int, 0444);
> >>>  MODULE_PARM_DESC(max_part, "Num Minors to reserve between
> >>> devices"); 
> >>> +static int rd_numa_node = NUMA_NO_NODE;
> >>> +module_param(rd_numa_node, int, 0444);
> >>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
> >>> disk on.");    
> >>
> >> This could feasibly be 0644, as there would be nothing wrong with
> >> altering this at runtime.
> >>  
> > 
> > While we could it would not change the allocation of _existing_ ram
> > devices, making behaviour rather unpredictable.
> > Hence I did decide against it (and yes, I actually thought about
> > it).
> > 
> > But if you insist ...  
> 
> Right, it would just change new allocations. Probably not a common use
> case, but there's really nothing that prevents it from being feasible.
> 
> Next question - what does the memory allocator do if we run out of
> memory on the given node? Should we punt to a different node if that
> happens? Slower, but functional, seems preferable to not being able
> to get memory.
> 

Hmm. That I haven't considered; yes, that really sounds like an idea.
Will be sending an updated patch.

Cheers,

Hannes
Adam Manzanares June 14, 2018, 8:32 p.m. UTC | #5
On 6/14/18 9:09 AM, Hannes Reinecke wrote:
> On Thu, 14 Jun 2018 09:33:35 -0600
> Jens Axboe <axboe@kernel.dk> wrote:
> 
>> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
>>> On Thu, 14 Jun 2018 08:47:33 -0600
>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>>>> For performance reasons we should be able to allocate all memory
>>>>> from a given NUMA node, so this patch adds a new parameter
>>>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>>>> When restricing fio to use the same NUMA node I'm seeing a
>>>>> performance boost of more than 200%.
>>>>
>>>> Looks fine to me. One comment.
>>>>
>>>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>>>   module_param(max_part, int, 0444);
>>>>>   MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>>>> devices");
>>>>> +static int rd_numa_node = NUMA_NO_NODE;
>>>>> +module_param(rd_numa_node, int, 0444);
>>>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>>>> disk on.");
>>>>
>>>> This could feasibly be 0644, as there would be nothing wrong with
>>>> altering this at runtime.
>>>>
>>>
>>> While we could it would not change the allocation of _existing_ ram
>>> devices, making behaviour rather unpredictable.
>>> Hence I did decide against it (and yes, I actually thought about
>>> it).
>>>
>>> But if you insist ...
>>
>> Right, it would just change new allocations. Probably not a common use
>> case, but there's really nothing that prevents it from being feasible.
>>
>> Next question - what does the memory allocator do if we run out of
>> memory on the given node? Should we punt to a different node if that
>> happens? Slower, but functional, seems preferable to not being able
>> to get memory.
>>
> 
> Hmm. That I haven't considered; yes, that really sounds like an idea.
> Will be sending an updated patch.

Will numactl ... modprobe brd ... solve this problem?

> Cheers,
> 
> Hannes
>
Jens Axboe June 14, 2018, 8:37 p.m. UTC | #6
On 6/14/18 2:32 PM, Adam Manzanares wrote:
> 
> 
> On 6/14/18 9:09 AM, Hannes Reinecke wrote:
>> On Thu, 14 Jun 2018 09:33:35 -0600
>> Jens Axboe <axboe@kernel.dk> wrote:
>>
>>> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
>>>> On Thu, 14 Jun 2018 08:47:33 -0600
>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>    
>>>>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>>>>> For performance reasons we should be able to allocate all memory
>>>>>> from a given NUMA node, so this patch adds a new parameter
>>>>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>>>>> When restricing fio to use the same NUMA node I'm seeing a
>>>>>> performance boost of more than 200%.
>>>>>
>>>>> Looks fine to me. One comment.
>>>>>   
>>>>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>>>>   module_param(max_part, int, 0444);
>>>>>>   MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>>>>> devices");
>>>>>> +static int rd_numa_node = NUMA_NO_NODE;
>>>>>> +module_param(rd_numa_node, int, 0444);
>>>>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>>>>> disk on.");
>>>>>
>>>>> This could feasibly be 0644, as there would be nothing wrong with
>>>>> altering this at runtime.
>>>>>   
>>>>
>>>> While we could it would not change the allocation of _existing_ ram
>>>> devices, making behaviour rather unpredictable.
>>>> Hence I did decide against it (and yes, I actually thought about
>>>> it).
>>>>
>>>> But if you insist ...
>>>
>>> Right, it would just change new allocations. Probably not a common use
>>> case, but there's really nothing that prevents it from being feasible.
>>>
>>> Next question - what does the memory allocator do if we run out of
>>> memory on the given node? Should we punt to a different node if that
>>> happens? Slower, but functional, seems preferable to not being able
>>> to get memory.
>>>
>>
>> Hmm. That I haven't considered; yes, that really sounds like an idea.
>> Will be sending an updated patch.
> 
> Will numactl ... modprobe brd ... solve this problem?

It won't, pages are allocated as needed.
Adam Manzanares June 14, 2018, 8:41 p.m. UTC | #7
On 6/14/18 1:37 PM, Jens Axboe wrote:
> On 6/14/18 2:32 PM, Adam Manzanares wrote:
>>
>> On 6/14/18 9:09 AM, Hannes Reinecke wrote:
>>> On Thu, 14 Jun 2018 09:33:35 -0600
>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>
>>>> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
>>>>> On Thu, 14 Jun 2018 08:47:33 -0600
>>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>>
>>>>>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>>>>>> For performance reasons we should be able to allocate all memory
>>>>>>> from a given NUMA node, so this patch adds a new parameter
>>>>>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>>>>>> When restricing fio to use the same NUMA node I'm seeing a
>>>>>>> performance boost of more than 200%.
>>>>>>
>>>>>> Looks fine to me. One comment.
>>>>>>
>>>>>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>>>>>    module_param(max_part, int, 0444);
>>>>>>>    MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>>>>>> devices");
>>>>>>> +static int rd_numa_node = NUMA_NO_NODE;
>>>>>>> +module_param(rd_numa_node, int, 0444);
>>>>>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>>>>>> disk on.");
>>>>>>
>>>>>> This could feasibly be 0644, as there would be nothing wrong with
>>>>>> altering this at runtime.
>>>>>
>>>>> While we could it would not change the allocation of _existing_ ram
>>>>> devices, making behaviour rather unpredictable.
>>>>> Hence I did decide against it (and yes, I actually thought about
>>>>> it).
>>>>>
>>>>> But if you insist ...
>>>>
>>>> Right, it would just change new allocations. Probably not a common use
>>>> case, but there's really nothing that prevents it from being feasible.
>>>>
>>>> Next question - what does the memory allocator do if we run out of
>>>> memory on the given node? Should we punt to a different node if that
>>>> happens? Slower, but functional, seems preferable to not being able
>>>> to get memory.
>>>>
>>>
>>> Hmm. That I haven't considered; yes, that really sounds like an idea.
>>> Will be sending an updated patch.
>>
>> Will numactl ... modprobe brd ... solve this problem?
> 
> It won't, pages are allocated as needed.
> 

Then how about a numactl ... dd /dev/ram ... after the modprobe.
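
Combining this suggestion with Jens' later note about direct writes, such a
prefill pass might look like the sketch below. The node number and ramdisk
size are assumptions; oflag=direct makes dd itself trigger the page
allocations instead of leaving them to later writeback by other tasks:

    modprobe brd rd_nr=1 rd_size=1048576
    numactl -N 1 -m 1 dd if=/dev/zero of=/dev/ram0 bs=1M oflag=direct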
Jens Axboe June 14, 2018, 8:47 p.m. UTC | #8
On 6/14/18 2:41 PM, Adam Manzanares wrote:
> 
> 
> On 6/14/18 1:37 PM, Jens Axboe wrote:
>> On 6/14/18 2:32 PM, Adam Manzanares wrote:
>>>
>>>
>>> On 6/14/18 9:09 AM, Hannes Reinecke wrote:
>>>> On Thu, 14 Jun 2018 09:33:35 -0600
>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>
>>>>> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
>>>>>> On Thu, 14 Jun 2018 08:47:33 -0600
>>>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>     
>>>>>>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>>>>>>> For performance reasons we should be able to allocate all memory
>>>>>>>> from a given NUMA node, so this patch adds a new parameter
>>>>>>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>>>>>>> When restricing fio to use the same NUMA node I'm seeing a
>>>>>>>> performance boost of more than 200%.
>>>>>>>
>>>>>>> Looks fine to me. One comment.
>>>>>>>    
>>>>>>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>>>>>>    module_param(max_part, int, 0444);
>>>>>>>>    MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>>>>>>> devices");
>>>>>>>> +static int rd_numa_node = NUMA_NO_NODE;
>>>>>>>> +module_param(rd_numa_node, int, 0444);
>>>>>>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>>>>>>> disk on.");
>>>>>>>
>>>>>>> This could feasibly be 0644, as there would be nothing wrong with
>>>>>>> altering this at runtime.
>>>>>>>    
>>>>>>
>>>>>> While we could it would not change the allocation of _existing_ ram
>>>>>> devices, making behaviour rather unpredictable.
>>>>>> Hence I did decide against it (and yes, I actually thought about
>>>>>> it).
>>>>>>
>>>>>> But if you insist ...
>>>>>
>>>>> Right, it would just change new allocations. Probably not a common use
>>>>> case, but there's really nothing that prevents it from being feasible.
>>>>>
>>>>> Next question - what does the memory allocator do if we run out of
>>>>> memory on the given node? Should we punt to a different node if that
>>>>> happens? Slower, but functional, seems preferable to not being able
>>>>> to get memory.
>>>>>
>>>>
>>>> Hmm. That I haven't considered; yes, that really sounds like an idea.
>>>> Will be sending an updated patch.
>>>
>>> Will numactl ... modprobe brd ... solve this problem?
>>
>> It won't, pages are allocated as needed.
>>
> 
> Then how about a numactl ... dd /dev/ram ... after the modprobe.

Yes of course, or you could do that for every application that ends
up in the path of doing IO to it. The point of the option is to
just make it explicit, and not have to either NUMA pin each task,
or prefill all possible pages.
Adam Manzanares June 14, 2018, 8:53 p.m. UTC | #9
On 6/14/18 1:47 PM, Jens Axboe wrote:
> On 6/14/18 2:41 PM, Adam Manzanares wrote:
>>
>> On 6/14/18 1:37 PM, Jens Axboe wrote:
>>> On 6/14/18 2:32 PM, Adam Manzanares wrote:
>>>>
>>>> On 6/14/18 9:09 AM, Hannes Reinecke wrote:
>>>>> On Thu, 14 Jun 2018 09:33:35 -0600
>>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>>
>>>>>> On 6/14/18 9:29 AM, Hannes Reinecke wrote:
>>>>>>> On Thu, 14 Jun 2018 08:47:33 -0600
>>>>>>> Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>
>>>>>>>> On 6/14/18 7:38 AM, Hannes Reinecke wrote:
>>>>>>>>> For performance reasons we should be able to allocate all memory
>>>>>>>>> from a given NUMA node, so this patch adds a new parameter
>>>>>>>>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>>>>>>>>> When restricing fio to use the same NUMA node I'm seeing a
>>>>>>>>> performance boost of more than 200%.
>>>>>>>>
>>>>>>>> Looks fine to me. One comment.
>>>>>>>>
>>>>>>>>> @@ -342,6 +343,10 @@ static int max_part = 1;
>>>>>>>>>     module_param(max_part, int, 0444);
>>>>>>>>>     MODULE_PARM_DESC(max_part, "Num Minors to reserve between
>>>>>>>>> devices");
>>>>>>>>> +static int rd_numa_node = NUMA_NO_NODE;
>>>>>>>>> +module_param(rd_numa_node, int, 0444);
>>>>>>>>> +MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM
>>>>>>>>> disk on.");
>>>>>>>>
>>>>>>>> This could feasibly be 0644, as there would be nothing wrong with
>>>>>>>> altering this at runtime.
>>>>>>>
>>>>>>> While we could it would not change the allocation of _existing_ ram
>>>>>>> devices, making behaviour rather unpredictable.
>>>>>>> Hence I did decide against it (and yes, I actually thought about
>>>>>>> it).
>>>>>>>
>>>>>>> But if you insist ...
>>>>>>
>>>>>> Right, it would just change new allocations. Probably not a common use
>>>>>> case, but there's really nothing that prevents it from being feasible.
>>>>>>
>>>>>> Next question - what does the memory allocator do if we run out of
>>>>>> memory on the given node? Should we punt to a different node if that
>>>>>> happens? Slower, but functional, seems preferable to not being able
>>>>>> to get memory.
>>>>>>
>>>>>
>>>>> Hmm. That I haven't considered; yes, that really sounds like an idea.
>>>>> Will be sending an updated patch.
>>>>
>>>> Will numactl ... modprobe brd ... solve this problem?
>>>
>>> It won't, pages are allocated as needed.
>>>
>>
>> Then how about a numactl ... dd /dev/ram ... after the modprobe.
> 
> Yes of course, or you could do that for every application that ends
> up in the path of the doing IO to it. The point of the option is to
> just make it explicit, and not have to either NUMA pin each task,
> or prefill all possible pages.
>

Makes sense. I have done some similar benchmarking and had to worry
about NUMA awareness; the numactl + dd approach seemed to work, since
I wanted to avoid the performance hit of page allocation during the
benchmarking.

Would anyone be interested in forcing the allocations to occur during 
module initialization?
Hannes Reinecke June 15, 2018, 6:06 a.m. UTC | #10
On 06/14/2018 10:53 PM, Adam Manzanares wrote:
[ .. ]
>>>
>>> Then how about a numactl ... dd /dev/ram ... after the modprobe.
>>
>> Yes of course, or you could do that for every application that ends
>> up in the path of the doing IO to it. The point of the option is to
>> just make it explicit, and not have to either NUMA pin each task,
>> or prefill all possible pages.
>>
> 
> Makes sense, I have done some similar benchmarking and had to worry
> about NUMA awareness and the numactl + dd approach seemed to work
> because I wanted to not take a performance hit for page allocation
> during the benchmarking.
> 
> Would anyone be interested in forcing the allocations to occur during
> module initialization?
> 
YES.

Cheers,

Hannes
Christoph Hellwig June 15, 2018, 7:30 a.m. UTC | #11
On Thu, Jun 14, 2018 at 09:33:35AM -0600, Jens Axboe wrote:
> Next question - what does the memory allocator do if we run out of
> memory on the given node? Should we punt to a different node if that
> happens? Slower, but functional, seems preferable to not being able
> to get memory.

When using alloc_pages_node the passed-in node id is just a hint, the
allocator will use all available memory if needed.
Mel Gorman June 15, 2018, 9:23 a.m. UTC | #12
On Thu, Jun 14, 2018 at 02:47:39PM -0600, Jens Axboe wrote:
> >>> Will numactl ... modprobe brd ... solve this problem?
> >>
> >> It won't, pages are allocated as needed.
> >>
> > 
> > Then how about a numactl ... dd /dev/ram ... after the modprobe.
> 
> Yes of course, or you could do that for every application that ends
> up in the path of the doing IO to it. The point of the option is to
> just make it explicit, and not have to either NUMA pin each task,
> or prefill all possible pages.
> 

It's certainly possible from userspace using dd and numactl setting the
desired memory policy. mmtests has the following snippet when setting
up a benchmark using brd to deal with both NUMA artifacts and variable
performance due to first faults early in the lifetime of a benchmark.

                modprobe brd rd_size=$((TESTDISK_RD_SIZE/1024))
                if [ "$TESTDISK_RD_PREALLOC" == "yes" ]; then
                        if [ "$TESTDISK_RD_PREALLOC_NODE" != "" ]; then
                                tmp_prealloc_cmd="numactl -N $TESTDISK_RD_PREALLOC_NODE"
                        else
                                tmp_prealloc_cmd="numactl -i all"
                        fi
                        $tmp_prealloc_cmd dd if=/dev/zero of=/dev/ram0 bs=1M &>/dev/null
                fi

(Haven't actually validated this in a long time but it worked at some point)

First option allocates just from one node, the other interleaves between
everything. Any combination of nodes or policies can be used and this was
very simple, but it's what was needed at the time. The question is how
far do you want to go with supporting policies within the module?

One option would be to keep this very simple like the patch suggests so users
get the hint that it's even worth considering and then point at a document
on how to do more complex policies from userspace at device creation time.
Another is simply to document the hazard that the locality of memory is
controlled by the memory policy of the first task that touches it.
Bart Van Assche June 15, 2018, 2:07 p.m. UTC | #13
On Thu, 2018-06-14 at 15:38 +0200, Hannes Reinecke wrote:
> For performance reasons we should be able to allocate all memory
> from a given NUMA node, so this patch adds a new parameter
> 'rd_numa_node' to allow the user to specify the NUMA node id.
> When restricing fio to use the same NUMA node I'm seeing a performance
> boost of more than 200%.

Passing this information through a kernel module parameter to the brd kernel
module seems wrong to me. There can be multiple brd instances. Using a kernel
module parameter makes it impossible to specify a different NUMA node for
different brd instances.

Bart.
Jens Axboe June 15, 2018, 2:12 p.m. UTC | #14
On 6/15/18 1:30 AM, Christoph Hellwig wrote:
> On Thu, Jun 14, 2018 at 09:33:35AM -0600, Jens Axboe wrote:
>> Next question - what does the memory allocator do if we run out of
>> memory on the given node? Should we punt to a different node if that
>> happens? Slower, but functional, seems preferable to not being able
>> to get memory.
> 
> When using alloc_pages_node the passed in node id is just a hint, the
> allocator will use all avaiable memory if nedeed.

OK good, that's not a concern then.
Jens Axboe June 15, 2018, 2:28 p.m. UTC | #15
On 6/15/18 3:23 AM, Mel Gorman wrote:
> On Thu, Jun 14, 2018 at 02:47:39PM -0600, Jens Axboe wrote:
>>>>> Will numactl ... modprobe brd ... solve this problem?
>>>>
>>>> It won't, pages are allocated as needed.
>>>>
>>>
>>> Then how about a numactl ... dd /dev/ram ... after the modprobe.
>>
>> Yes of course, or you could do that for every application that ends
>> up in the path of the doing IO to it. The point of the option is to
>> just make it explicit, and not have to either NUMA pin each task,
>> or prefill all possible pages.
>>
> 
> It's certainly possible from userspace using dd and numactl setting the
> desired memory policy. mmtests has the following snippet when setting
> up a benchmark using brd to deal with both NUMA artifacts and variable
> performance due to first faults early in the lifetime of a benchmark.
> 
>                 modprobe brd rd_size=$((TESTDISK_RD_SIZE/1024))
>                 if [ "$TESTDISK_RD_PREALLOC" == "yes" ]; then
>                         if [ "$TESTDISK_RD_PREALLOC_NODE" != "" ]; then
>                                 tmp_prealloc_cmd="numactl -N $TESTDISK_RD_PREALLOC_NODE"
>                         else
>                                 tmp_prealloc_cmd="numactl -i all"
>                         fi
>                         $tmp_prealloc_cmd dd if=/dev/zero of=/dev/ram0 bs=1M &>/dev/null
>                 fi
> 
> (Haven't actually validated this in a long time but it worked at some point)

You'd want to make this oflag=direct as well (this goes for Adam, too), or
you could have pages being written that are NOT issued by dd.

> First option allocates just from one node, the other interleaves between
> everything. Any combination of nodes or policies can be used and this was
> very simple, but it's what was needed at the time. The question is how
> far do you want to go with supporting policies within the module?

Not far, imho :-)

> One option would be to keep this very simple like the patch suggests so users
> get the hint that it's even worth considering and then point at a document
> on how to do more complex policies from userspace at device creation time.
> Another is simply to document the hazard that the locality of memory is
> controlled by the memory policy of the first task that touches it.

I like the simple option, especially since (as Christoph pointed out) if we
fail allocating from the given node, we'll just go elsewhere.
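
With that correction applied, the dd line from Mel's mmtests snippet above
would become (sketch only):

    $tmp_prealloc_cmd dd if=/dev/zero of=/dev/ram0 bs=1M oflag=direct &>/dev/null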
Hannes Reinecke June 15, 2018, 4:55 p.m. UTC | #16
On 06/15/2018 04:07 PM, Bart Van Assche wrote:
> On Thu, 2018-06-14 at 15:38 +0200, Hannes Reinecke wrote:
>> For performance reasons we should be able to allocate all memory
>> from a given NUMA node, so this patch adds a new parameter
>> 'rd_numa_node' to allow the user to specify the NUMA node id.
>> When restricing fio to use the same NUMA node I'm seeing a performance
>> boost of more than 200%.
> 
> Passing this information through a kernel module parameter to the brd kernel
> module seems wrong to me. There can be multiple brd instances. Using a kernel
> module parameter makes it impossible to specify a different NUMA node for
> different brd instances.
> 
This patch was primarily done for simplicity; all the existing brd
parameters affect _all_ ramdisks, so this patch keeps that style.

If you want something more fine-grained you could use the approach
suggested by Mel Gorman and use 'numactl' to pre-fill the ramdisk via 'dd'.

Cheers,

Hannes
Bart Van Assche June 15, 2018, 4:58 p.m. UTC | #17
On Fri, 2018-06-15 at 18:55 +0200, Hannes Reinecke wrote:
> On 06/15/2018 04:07 PM, Bart Van Assche wrote:
> > On Thu, 2018-06-14 at 15:38 +0200, Hannes Reinecke wrote:
> > > For performance reasons we should be able to allocate all memory
> > > from a given NUMA node, so this patch adds a new parameter
> > > 'rd_numa_node' to allow the user to specify the NUMA node id.
> > > When restricing fio to use the same NUMA node I'm seeing a performance
> > > boost of more than 200%.
> > 
> > Passing this information through a kernel module parameter to the brd kernel
> > module seems wrong to me. There can be multiple brd instances. Using a kernel
> > module parameter makes it impossible to specify a different NUMA node for
> > different brd instances.
> 
> This patch has primarily done for simplicity; all the existing brd
> parameters affect _all_ ramdisks, so this patch keeps that style.
> 
> If you want soemthing more fine-grained you could use the approach
> suggested by Mel Gorman and use 'numactl' to pre-fill the ramdisk via 'dd'.


That's a cumbersome approach, as illustrated by the fact that Mel forgot to use
direct writes in his examples. If Mel overlooked that, more people will forget
to use direct writes.

Bart.

Patch

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bb976598ee43..7142d836539e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -36,6 +36,7 @@ 
  */
 struct brd_device {
 	int		brd_number;
+	int		brd_numa_node;
 
 	struct request_queue	*brd_queue;
 	struct gendisk		*brd_disk;
@@ -103,7 +104,7 @@  static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 	 * restriction might be able to be lifted.
 	 */
 	gfp_flags = GFP_NOIO | __GFP_ZERO;
-	page = alloc_page(gfp_flags);
+	page = alloc_pages_node(brd->brd_numa_node, gfp_flags, 0);
 	if (!page)
 		return NULL;
 
@@ -342,6 +343,10 @@  static int max_part = 1;
 module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
 
+static int rd_numa_node = NUMA_NO_NODE;
+module_param(rd_numa_node, int, 0444);
+MODULE_PARM_DESC(rd_numa_node, "NUMA node number to allocate RAM disk on.");
+
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
 MODULE_ALIAS("rd");
@@ -363,7 +368,7 @@  __setup("ramdisk_size=", ramdisk_size);
 static LIST_HEAD(brd_devices);
 static DEFINE_MUTEX(brd_devices_mutex);
 
-static struct brd_device *brd_alloc(int i)
+static struct brd_device *brd_alloc(int i, int node)
 {
 	struct brd_device *brd;
 	struct gendisk *disk;
@@ -372,10 +377,11 @@  static struct brd_device *brd_alloc(int i)
 	if (!brd)
 		goto out;
 	brd->brd_number		= i;
+	brd->brd_numa_node = node;
 	spin_lock_init(&brd->brd_lock);
 	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
 
-	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
+	brd->brd_queue = blk_alloc_queue_node(GFP_KERNEL, node, NULL);
 	if (!brd->brd_queue)
 		goto out_free_dev;
 
@@ -434,7 +440,7 @@  static struct brd_device *brd_init_one(int i, bool *new)
 			goto out;
 	}
 
-	brd = brd_alloc(i);
+	brd = brd_alloc(i, rd_numa_node);
 	if (brd) {
 		add_disk(brd->brd_disk);
 		list_add_tail(&brd->brd_list, &brd_devices);
@@ -495,7 +501,7 @@  static int __init brd_init(void)
 		max_part = 1;
 
 	for (i = 0; i < rd_nr; i++) {
-		brd = brd_alloc(i);
+		brd = brd_alloc(i, rd_numa_node);
 		if (!brd)
 			goto out_free;
 		list_add_tail(&brd->brd_list, &brd_devices);