diff mbox series

[RFC] arm64/vmalloc: use module region only for module_alloc() if CONFIG_RANDOMIZE_BASE is set

Message ID 20221227092634.445212-1-liushixin2@huawei.com (mailing list archive)
State New
Headers show
Series [RFC] arm64/vmalloc: use module region only for module_alloc() if CONFIG_RANDOMIZE_BASE is set | expand

Commit Message

Liu Shixin Dec. 27, 2022, 9:26 a.m. UTC
After I added a 10GB pmem device, I got the following error message when
inserting a module:

 insmod: vmalloc error: size 16384, vm_struct allocation failed,
 mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0

If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
vmalloc region entirely. Although module_alloc() can fall back to a 2GB
window if ARM64_MODULE_PLTS is set, the module region is still easily
exhausted because the module region is located at the bottom of the vmalloc
region and the vmalloc region is allocated from bottom to top.

Skip the module region if not called from module_alloc().

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 arch/arm64/include/asm/vmalloc.h | 26 ++++++++++++++++++++++++++
 include/linux/vmalloc.h          |  9 +++++++++
 mm/vmalloc.c                     |  4 ++++
 3 files changed, 39 insertions(+)

Comments

Liu Shixin Jan. 29, 2023, 2:44 a.m. UTC | #1
Hi,


This patch seems to have been lost in the corner. Recently I've met this problem again

on v6.1, so I would like to propose this patch again.


Thanks,


On 2022/12/27 17:26, Liu Shixin wrote:
> After I add a 10GB pmem device, I got the following error message when
> insert module:
>
>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
>
> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> window if ARM64_MODULE_PLTS is set, the module region is still easily
> exhausted because the module region is located at bottom of vmalloc region
> and the vmalloc region is allocated from bottom to top.
>
> Skip module region if not calling from module_alloc().
>
> Signed-off-by: Liu Shixin <liushixin2@huawei.com>
> ---
>  arch/arm64/include/asm/vmalloc.h | 26 ++++++++++++++++++++++++++
>  include/linux/vmalloc.h          |  9 +++++++++
>  mm/vmalloc.c                     |  4 ++++
>  3 files changed, 39 insertions(+)
>
> diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
> index 38fafffe699f..4feff546b11b 100644
> --- a/arch/arm64/include/asm/vmalloc.h
> +++ b/arch/arm64/include/asm/vmalloc.h
> @@ -31,4 +31,30 @@ static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
>  	return pgprot_tagged(prot);
>  }
>  
> +#ifdef CONFIG_RANDOMIZE_BASE
> +extern u64 module_alloc_base;
> +#define arch_vmap_skip_module_region arch_vmap_skip_module_region
> +static inline void arch_vmap_skip_module_region(unsigned long *addr,
> +						unsigned long vstart,
> +						unsigned long size,
> +						unsigned long align)
> +{
> +	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> +
> +	if (vstart == module_alloc_base)
> +		return;
> +
> +	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
> +	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
> +		/* don't exceed the static module region - see module_alloc() */
> +		module_alloc_end = MODULES_END;
> +
> +	if ((module_alloc_base >= *addr + size) ||
> +	    (module_alloc_end <= *addr))
> +		return;
> +
> +	*addr = ALIGN(module_alloc_end, align);
> +}
> +#endif
> +
>  #endif /* _ASM_ARM64_VMALLOC_H */
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 096d48aa3437..55ef97325b84 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -122,6 +122,15 @@ static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
>  }
>  #endif
>  
> +#ifndef arch_vmap_skip_module_region
> +static inline void arch_vmap_skip_module_region(unsigned long *addr,
> +						unsigned long vstart,
> +						unsigned long size,
> +						unsigned long align)
> +{
> +}
> +#endif
> +
>  /*
>   *	Highlevel APIs for driver use
>   */
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ca71de7c9d77..c840d673052e 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1236,6 +1236,8 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
>  	else
>  		nva_start_addr = ALIGN(vstart, align);
>  
> +	arch_vmap_skip_module_region(&nva_start_addr, vstart, size, align);
> +
>  	/* Can be overflowed due to big size or alignment. */
>  	if (nva_start_addr + size < nva_start_addr ||
>  			nva_start_addr < vstart)
> @@ -1523,6 +1525,8 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
>  	else
>  		nva_start_addr = ALIGN(vstart, align);
>  
> +	arch_vmap_skip_module_region(&nva_start_addr, vstart, size, align);
> +
>  	/* Check the "vend" restriction. */
>  	if (nva_start_addr + size > vend)
>  		return vend;
Andrew Morton Jan. 29, 2023, 9:41 p.m. UTC | #2
On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:

> Hi,
> 
> 
> This patch seems to have been lost in the corner. Recently I've meet this problem again
> 
> on v6.1, so I would like to propose this patch again.
> 
> 
> Thanks,
> 
> 
> On 2022/12/27 17:26, Liu Shixin wrote:
> > After I add a 10GB pmem device, I got the following error message when
> > insert module:
> >
> >  insmod: vmalloc error: size 16384, vm_struct allocation failed,
> >  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
> >
> > If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> > vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> > window if ARM64_MODULE_PLTS is set, the module region is still easily
> > exhausted because the module region is located at bottom of vmalloc region
> > and the vmalloc region is allocated from bottom to top.
> >
> > Skip module region if not calling from module_alloc().
> >

I'll assume this is for the arm tree.

Acked-by: Andrew Morton <akpm@linux-foundation.org>
Will Deacon Jan. 31, 2023, 3:06 p.m. UTC | #3
+Ard -- full thread here:

https://lore.kernel.org/all/20221227092634.445212-1-liushixin2@huawei.com/

On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
> 
> > Hi,
> > 
> > 
> > This patch seems to have been lost in the corner. Recently I've meet this problem again
> > 
> > on v6.1, so I would like to propose this patch again.
> > 
> > 
> > Thanks,
> > 
> > 
> > On 2022/12/27 17:26, Liu Shixin wrote:
> > > After I add a 10GB pmem device, I got the following error message when
> > > insert module:
> > >
> > >  insmod: vmalloc error: size 16384, vm_struct allocation failed,
> > >  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
> > >
> > > If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> > > vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> > > window if ARM64_MODULE_PLTS is set, the module region is still easily
> > > exhausted because the module region is located at bottom of vmalloc region
> > > and the vmalloc region is allocated from bottom to top.
> > >
> > > Skip module region if not calling from module_alloc().
> > >
> 
> I'll assume this is for the arm tree.
> 
> Acked-by: Andrew Morton <akpm@linux-foundation.org>

This looks like the same issue previously reported at:

https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/

where Ard had a few suggestions but, afaict, they didn't help.

Will
Will Deacon Jan. 31, 2023, 3:07 p.m. UTC | #4
Now really adding Ard...

On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
> +Ard -- full thread here:
> 
> https://lore.kernel.org/all/20221227092634.445212-1-liushixin2@huawei.com/
> 
> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
> > On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
> > 
> > > Hi,
> > > 
> > > 
> > > This patch seems to have been lost in the corner. Recently I've meet this problem again
> > > 
> > > on v6.1, so I would like to propose this patch again.
> > > 
> > > 
> > > Thanks,
> > > 
> > > 
> > > On 2022/12/27 17:26, Liu Shixin wrote:
> > > > After I add a 10GB pmem device, I got the following error message when
> > > > insert module:
> > > >
> > > >  insmod: vmalloc error: size 16384, vm_struct allocation failed,
> > > >  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
> > > >
> > > > If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> > > > vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> > > > window if ARM64_MODULE_PLTS is set, the module region is still easily
> > > > exhausted because the module region is located at bottom of vmalloc region
> > > > and the vmalloc region is allocated from bottom to top.
> > > >
> > > > Skip module region if not calling from module_alloc().
> > > >
> > 
> > I'll assume this is for the arm tree.
> > 
> > Acked-by: Andrew Morton <akpm@linux-foundation.org>
> 
> This looks like the same issue previously reported at:
> 
> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
> 
> where Ard had a few suggestions but, afaict, they didn't help.
> 
> Will
Ard Biesheuvel Jan. 31, 2023, 4:03 p.m. UTC | #5
On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
>
> Now really adding Ard...
>
> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
> > +Ard -- full thread here:
> >
> > https://lore.kernel.org/all/20221227092634.445212-1-liushixin2@huawei.com/
> >
> > On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
> > > On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
> > >
> > > > Hi,
> > > >
> > > >
> > > > This patch seems to have been lost in the corner. Recently I've meet this problem again
> > > >
> > > > on v6.1, so I would like to propose this patch again.
> > > >
> > > >
> > > > Thanks,
> > > >
> > > >
> > > > On 2022/12/27 17:26, Liu Shixin wrote:
> > > > > After I add a 10GB pmem device, I got the following error message when
> > > > > insert module:
> > > > >
> > > > >  insmod: vmalloc error: size 16384, vm_struct allocation failed,
> > > > >  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
> > > > >
> > > > > If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> > > > > vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> > > > > window if ARM64_MODULE_PLTS is set, the module region is still easily
> > > > > exhausted because the module region is located at bottom of vmalloc region
> > > > > and the vmalloc region is allocated from bottom to top.
> > > > >
> > > > > Skip module region if not calling from module_alloc().
> > > > >
> > >
> > > I'll assume this is for the arm tree.
> > >
> > > Acked-by: Andrew Morton <akpm@linux-foundation.org>
> >
> > This looks like the same issue previously reported at:
> >
> > https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
> >
> > where Ard had a few suggestions but, afaict, they didn't help.
> >

Thanks for the cc.

So this is a bit clunky, and I wonder whether we wouldn't be better
off just splitting the vmalloc region into two separate regions: one
for the kernel and modules, and one for everything else. That way, we
lose one bit of entropy in the randomized placement, but the default
48-bit VA space is vast anyway, and even on 39-bit VA configs (such as
Android), I seriously doubt that we come anywhere close to exhausting
the vmalloc space today.
Liu Shixin Feb. 3, 2023, 8:56 a.m. UTC | #6
On 2023/2/1 0:03, Ard Biesheuvel wrote:
> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
>> Now really adding Ard...
>>
>> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
>>> +Ard -- full thread here:
>>>
>>> https://lore.kernel.org/all/20221227092634.445212-1-liushixin2@huawei.com/
>>>
>>> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
>>>> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
>>>>
>>>>> Hi,
>>>>>
>>>>>
>>>>> This patch seems to have been lost in the corner. Recently I've meet this problem again
>>>>>
>>>>> on v6.1, so I would like to propose this patch again.
>>>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>>
>>>>> On 2022/12/27 17:26, Liu Shixin wrote:
>>>>>> After I add a 10GB pmem device, I got the following error message when
>>>>>> insert module:
>>>>>>
>>>>>>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
>>>>>>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
>>>>>>
>>>>>> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
>>>>>> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
>>>>>> window if ARM64_MODULE_PLTS is set, the module region is still easily
>>>>>> exhausted because the module region is located at bottom of vmalloc region
>>>>>> and the vmalloc region is allocated from bottom to top.
>>>>>>
>>>>>> Skip module region if not calling from module_alloc().
>>>>>>
>>>> I'll assume this is for the arm tree.
>>>>
>>>> Acked-by: Andrew Morton <akpm@linux-foundation.org>
>>> This looks like the same issue previously reported at:
>>>
>>> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
>>>
>>> where Ard had a few suggestions but, afaict, they didn't help.
>>>
> Thanks for the cc.
>
> So this is a bit clunky, and I wonder whether we wouldn't be better
> off just splitting the vmalloc region into two separate regions: one
> for the kernel and modules, and one for everything else. That way, we
> lose one bit of entropy in the randomized placement, but the default
> 48-bit VA space is vast anway, and even on 39-bit VA configs (such as
> Android), I seriously doubt that we come anywhere close to exhausting
> the vmalloc space today.
> .
Thanks for your advice.

>
Will Deacon Feb. 7, 2023, 11:29 a.m. UTC | #7
On Tue, Jan 31, 2023 at 05:03:32PM +0100, Ard Biesheuvel wrote:
> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
> > On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
> > > On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
> > > > On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
> > > > > On 2022/12/27 17:26, Liu Shixin wrote:
> > > > > > After I add a 10GB pmem device, I got the following error message when
> > > > > > insert module:
> > > > > >
> > > > > >  insmod: vmalloc error: size 16384, vm_struct allocation failed,
> > > > > >  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
> > > > > >
> > > > > > If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> > > > > > vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> > > > > > window if ARM64_MODULE_PLTS is set, the module region is still easily
> > > > > > exhausted because the module region is located at bottom of vmalloc region
> > > > > > and the vmalloc region is allocated from bottom to top.
> > > > > >
> > > > > > Skip module region if not calling from module_alloc().
> > > > > >
> > > >
> > > > I'll assume this is for the arm tree.
> > > >
> > > > Acked-by: Andrew Morton <akpm@linux-foundation.org>
> > >
> > > This looks like the same issue previously reported at:
> > >
> > > https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
> > >
> > > where Ard had a few suggestions but, afaict, they didn't help.
> > >
> 
> Thanks for the cc.
> 
> So this is a bit clunky, and I wonder whether we wouldn't be better
> off just splitting the vmalloc region into two separate regions: one
> for the kernel and modules, and one for everything else. That way, we
> lose one bit of entropy in the randomized placement, but the default
> 48-bit VA space is vast anway, and even on 39-bit VA configs (such as
> Android), I seriously doubt that we come anywhere close to exhausting
> the vmalloc space today.

That sounds like a good idea to me.

Liu Shixin -- do you think you could have a go at implementing Ard's
suggestion instead?

Cheers,

Will
Thorsten Leemhuis Feb. 27, 2023, 3:08 p.m. UTC | #8
[CCing the regression list, as it should be in the loop for regressions:
https://docs.kernel.org/admin-guide/reporting-regressions.html]

On 07.02.23 12:29, Will Deacon wrote:
> On Tue, Jan 31, 2023 at 05:03:32PM +0100, Ard Biesheuvel wrote:
>> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
>>> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
>>>> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
>>>>> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
>>>>>> On 2022/12/27 17:26, Liu Shixin wrote:
>>>>>>> After I add a 10GB pmem device, I got the following error message when
>>>>>>> insert module:
>>>>>>>
>>>>>>>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
>>>>>>>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
>>>>>>>
>>>>>>> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
>>>>>>> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
>>>>>>> window if ARM64_MODULE_PLTS is set, the module region is still easily
>>>>>>> exhausted because the module region is located at bottom of vmalloc region
>>>>>>> and the vmalloc region is allocated from bottom to top.
>>>>>>>
>>>>>>> Skip module region if not calling from module_alloc().
>>>>>
>>>>> I'll assume this is for the arm tree.
>>>>>
>>>>> Acked-by: Andrew Morton <akpm@linux-foundation.org>
>>>>
>>>> This looks like the same issue previously reported at:
>>>>
>>>> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
>>>>
>>>> where Ard had a few suggestions but, afaict, they didn't help.
>>>>
>>
>> Thanks for the cc.
>>
>> So this is a bit clunky, and I wonder whether we wouldn't be better
>> off just splitting the vmalloc region into two separate regions: one
>> for the kernel and modules, and one for everything else. That way, we
>> lose one bit of entropy in the randomized placement, but the default
>> 48-bit VA space is vast anway, and even on 39-bit VA configs (such as
>> Android), I seriously doubt that we come anywhere close to exhausting
>> the vmalloc space today.
> 
> That sounds like a good idea to me.
> 
> Liu Shixin -- do you think you could have a go at implementing Ard's
> suggestion instead?

Liu Shixin, did you ever look into realizing this idea?

Or was some progress already made and I just missed it?

I'm asking, as the idea discussed afaics is not only supposed to fix the
regression you tried to address, but also one that is now three months
old and stalled since Mid-December -- which is really unfortunate, as
that's not how regressions should be handled. :-/ But well, it afaik was
caused by a patch from Ard, so it's obviously not your job to address
it. But it seems you were working on it.

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
If I did something stupid, please tell me, as explained on that page.
Ard Biesheuvel Feb. 27, 2023, 4:14 p.m. UTC | #9
On Mon, 27 Feb 2023 at 16:08, Linux regression tracking (Thorsten
Leemhuis) <regressions@leemhuis.info> wrote:
>
> [CCing the regression list, as it should be in the loop for regressions:
> https://docs.kernel.org/admin-guide/reporting-regressions.html]
>
> On 07.02.23 12:29, Will Deacon wrote:
> > On Tue, Jan 31, 2023 at 05:03:32PM +0100, Ard Biesheuvel wrote:
> >> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
> >>> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
> >>>> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
> >>>>> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
> >>>>>> On 2022/12/27 17:26, Liu Shixin wrote:
> >>>>>>> After I add a 10GB pmem device, I got the following error message when
> >>>>>>> insert module:
> >>>>>>>
> >>>>>>>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
> >>>>>>>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
> >>>>>>>
> >>>>>>> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> >>>>>>> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> >>>>>>> window if ARM64_MODULE_PLTS is set, the module region is still easily
> >>>>>>> exhausted because the module region is located at bottom of vmalloc region
> >>>>>>> and the vmalloc region is allocated from bottom to top.
> >>>>>>>
> >>>>>>> Skip module region if not calling from module_alloc().
> >>>>>
> >>>>> I'll assume this is for the arm tree.
> >>>>>
> >>>>> Acked-by: Andrew Morton <akpm@linux-foundation.org>
> >>>>
> >>>> This looks like the same issue previously reported at:
> >>>>
> >>>> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
> >>>>
> >>>> where Ard had a few suggestions but, afaict, they didn't help.
> >>>>
> >>
> >> Thanks for the cc.
> >>
> >> So this is a bit clunky, and I wonder whether we wouldn't be better
> >> off just splitting the vmalloc region into two separate regions: one
> >> for the kernel and modules, and one for everything else. That way, we
> >> lose one bit of entropy in the randomized placement, but the default
> >> 48-bit VA space is vast anway, and even on 39-bit VA configs (such as
> >> Android), I seriously doubt that we come anywhere close to exhausting
> >> the vmalloc space today.
> >
> > That sounds like a good idea to me.
> >
> > Liu Shixin -- do you think you could have a go at implementing Ard's
> > suggestion instead?
>
> Liu Shixin, did you ever look into realizing this idea?
>
> Or was some progress already made and I just missed it?
>

This patch

https://lore.kernel.org/all/20230223204101.1500373-1-ardb@kernel.org/

should fix the issue.

> I'm asking, as the idea discussed afaics is not only supposed to fix the
> regression you tried to address, but also one that is now three months
> old and stalled since Mid-December -- which is really unfortunate, as
> that's not how regressions should be handled. :-/

Is it documented anywhere how regressions should be handled? The
mailing list is flooded with hard to reproduce reports from users as
well as automatic fuzzers and build bots, so I don't think it is
entirely unreasonable to move unresponsive reporters to the back of
the queue.

> But well, it afaik was
> caused by a patch from Ard, so it's obviously not your job to address
> it. But it seems you were working on it.
>

We are all working together here, so please refrain from telling
people what they should or should not be working on. (I am aware that
you probably did not mean it that way, but things tend to get lost in
translation very easily on the mailing list)

Liu, could you please check whether the linked patch addresses your issue?

Thanks,
Ard.
Thorsten Leemhuis Feb. 27, 2023, 5:17 p.m. UTC | #10
On 27.02.23 17:14, Ard Biesheuvel wrote:
> On Mon, 27 Feb 2023 at 16:08, Linux regression tracking (Thorsten
> Leemhuis) <regressions@leemhuis.info> wrote:
>>
>> [CCing the regression list, as it should be in the loop for regressions:
>> https://docs.kernel.org/admin-guide/reporting-regressions.html]
>>
>> On 07.02.23 12:29, Will Deacon wrote:
>>> On Tue, Jan 31, 2023 at 05:03:32PM +0100, Ard Biesheuvel wrote:
>>>> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
>>>>> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
>>>>>> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
>>>>>>> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
>>>>>>>> On 2022/12/27 17:26, Liu Shixin wrote:
>>>>>>>>> After I add a 10GB pmem device, I got the following error message when
>>>>>>>>> insert module:
>>>>>>>>>
>>>>>>>>>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
>>>>>>>>>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
>>>>>>>>>
>>>>>>>>> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
>>>>>>>>> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
>>>>>>>>> window if ARM64_MODULE_PLTS is set, the module region is still easily
>>>>>>>>> exhausted because the module region is located at bottom of vmalloc region
>>>>>>>>> and the vmalloc region is allocated from bottom to top.
>>>>>>>>>
>>>>>>>>> Skip module region if not calling from module_alloc().
>>>>>>>
>>>>>>> I'll assume this is for the arm tree.
>>>>>>>
>>>>>>> Acked-by: Andrew Morton <akpm@linux-foundation.org>
>>>>>>
>>>>>> This looks like the same issue previously reported at:
>>>>>>
>>>>>> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
>>>>>>
>>>>>> where Ard had a few suggestions but, afaict, they didn't help.
>>>>>>
>>>>
>>>> Thanks for the cc.
>>>>
>>>> So this is a bit clunky, and I wonder whether we wouldn't be better
>>>> off just splitting the vmalloc region into two separate regions: one
>>>> for the kernel and modules, and one for everything else. That way, we
>>>> lose one bit of entropy in the randomized placement, but the default
>>>> 48-bit VA space is vast anway, and even on 39-bit VA configs (such as
>>>> Android), I seriously doubt that we come anywhere close to exhausting
>>>> the vmalloc space today.
>>>
>>> That sounds like a good idea to me.
>>>
>>> Liu Shixin -- do you think you could have a go at implementing Ard's
>>> suggestion instead?
>>
>> Liu Shixin, did you ever look into realizing this idea?
>>
>> Or was some progress already made and I just missed it?
> 
> This patch
> 
> https://lore.kernel.org/all/20230223204101.1500373-1-ardb@kernel.org/
> 
> should fix the issue.

Great, many thx.

>> I'm asking, as the idea discussed afaics is not only supposed to fix the
>> regression you tried to address, but also one that is now three months
>> old and stalled since Mid-December -- which is really unfortunate, as
>> that's not how regressions should be handled. :-/
> 
> Is it documented anywhere how regressions should be handled?

https://docs.kernel.org/process/handling-regressions.html

Side note: that also mentions use of the "Link" tag. If the patch had
one, I'd have noticed it and wouldn't have bothered anyone here.

> The
> mailing list is flooded with hard to reproduce reports from users as
> well as automatic fuzzers and build bots, so I don't think it is
> entirely unreasonable to move unresponsive reporters to the back of
> the queue.

I do that sometimes, but fwiw, from what I can see it wasn't a reporter
that was unresponsive:

https://lore.kernel.org/linux-arm-kernel/c1ff5cae-7f56-7fdb-c832-ffbcc177535b@leemhuis.info/

But I might be missing something, sorry if I do. And there was the
festive season, which complicated everything. Whatever, as long as this
is fixed.

/me wonders if we should ask "chenxiang (M)" to test that patch, too;
but /me is not even totally sure it's the same problem

>> But well, it afaik was
>> caused by a patch from Ard, so it's obviously not your job to address
>> it. But it seems you were working on it.
> 
> We are all working together here, so please refrain from telling
> people what they should or should not be working on. (I am aware that
> you probably did not mean it that way, but things tend to get lost in
> translation very easily on the mailing list)

Maybe I found the wrong words, sorry.

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
If I did something stupid, please tell me, as explained on that page.
Ard Biesheuvel Feb. 27, 2023, 5:53 p.m. UTC | #11
On Mon, 27 Feb 2023 at 18:17, Linux regression tracking (Thorsten
Leemhuis) <regressions@leemhuis.info> wrote:
>
>
>
> On 27.02.23 17:14, Ard Biesheuvel wrote:
> > On Mon, 27 Feb 2023 at 16:08, Linux regression tracking (Thorsten
> > Leemhuis) <regressions@leemhuis.info> wrote:
> >>
> >> [CCing the regression list, as it should be in the loop for regressions:
> >> https://docs.kernel.org/admin-guide/reporting-regressions.html]
> >>
> >> On 07.02.23 12:29, Will Deacon wrote:
> >>> On Tue, Jan 31, 2023 at 05:03:32PM +0100, Ard Biesheuvel wrote:
> >>>> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
> >>>>> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
> >>>>>> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
> >>>>>>> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
> >>>>>>>> On 2022/12/27 17:26, Liu Shixin wrote:
> >>>>>>>>> After I add a 10GB pmem device, I got the following error message when
> >>>>>>>>> insert module:
> >>>>>>>>>
> >>>>>>>>>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
> >>>>>>>>>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
> >>>>>>>>>
> >>>>>>>>> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
> >>>>>>>>> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
> >>>>>>>>> window if ARM64_MODULE_PLTS is set, the module region is still easily
> >>>>>>>>> exhausted because the module region is located at bottom of vmalloc region
> >>>>>>>>> and the vmalloc region is allocated from bottom to top.
> >>>>>>>>>
> >>>>>>>>> Skip module region if not calling from module_alloc().
> >>>>>>>
> >>>>>>> I'll assume this is for the arm tree.
> >>>>>>>
> >>>>>>> Acked-by: Andrew Morton <akpm@linux-foundation.org>
> >>>>>>
> >>>>>> This looks like the same issue previously reported at:
> >>>>>>
> >>>>>> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
> >>>>>>
> >>>>>> where Ard had a few suggestions but, afaict, they didn't help.
> >>>>>>
> >>>>
> >>>> Thanks for the cc.
> >>>>
> >>>> So this is a bit clunky, and I wonder whether we wouldn't be better
> >>>> off just splitting the vmalloc region into two separate regions: one
> >>>> for the kernel and modules, and one for everything else. That way, we
> >>>> lose one bit of entropy in the randomized placement, but the default
> >>>> 48-bit VA space is vast anway, and even on 39-bit VA configs (such as
> >>>> Android), I seriously doubt that we come anywhere close to exhausting
> >>>> the vmalloc space today.
> >>>
> >>> That sounds like a good idea to me.
> >>>
> >>> Liu Shixin -- do you think you could have a go at implementing Ard's
> >>> suggestion instead?
> >>
> >> Liu Shixin, did you ever look into realizing this idea?
> >>
> >> Or was some progress already made and I just missed it?
> >
> > This patch
> >
> > https://lore.kernel.org/all/20230223204101.1500373-1-ardb@kernel.org/
> >
> > should fix the issue.
>
> Great, many thx.
>
> >> I'm asking, as the idea discussed afaics is not only supposed to fix the
> >> regression you tried to address, but also one that is now three months
> >> old and stalled since Mid-December -- which is really unfortunate, as
> >> that's not how regressions should be handled. :-/
> >
> > Is it documented anywhere how regressions should be handled?
>
> https://docs.kernel.org/process/handling-regressions.html
>
> Side note: that also mentions use of the "Link" tag. If the patch had
> one, I'd noticed it and wouldn't bothered anyone here.
>

Thanks for the reference. I wasn't aware that that document existed.

However, please be careful with calling everything a regression: in
this particular case, the 10G pmem device simply never worked in this
configuration, and so calling this a regression, and quoting all these
rules that we must now abide by is, quite frankly, not entirely
appropriate.

Can we please reserve the 'regression' label for cases where the
workflow of a real user stopped working after a kernel change? I agree
that this must never happen, and so we should prioritize those cases
over the ones that are just ordinary bugs and not regressions.

> > The
> > mailing list is flooded with hard to reproduce reports from users as
> > well as automatic fuzzers and build bots, so I don't think it is
> > entirely unreasonable to move unresponsive reporters to the back of
> > the queue.
>
> I do that sometimes, but fwiw, from what I can see it wasn't a reporter
> that was unresponsive:
>
> https://lore.kernel.org/linux-arm-kernel/c1ff5cae-7f56-7fdb-c832-ffbcc177535b@leemhuis.info/
>
> But I might be missing something, sorry if I am. And there was the
> festive season, which complicated everything. Whatever, as long as
> this is fixed.
>
> /me wonders if we should ask "chenxiang (M)" to test that patch, too;
> but /me is not even totally sure it's the same problem
>

Yes, to me it looks like exactly the same issue.

> >> But well, it afaik was
> >> caused by a patch from Ard, so it's obviously not your job to address
> >> it. But it seems you were working on it.
> >
> > We are all working together here, so please refrain from telling
> > people what they should or should not be working on. (I am aware that
> > you probably did not mean it that way, but things tend to get lost in
> > translation very easily on the mailing list)
>
> Maybe I found the wrong words, sorry.
>

No worries. Your work is much appreciated, as I am sure it's a
thankless job at times.

Thanks,
Ard.
Liu Shixin Feb. 28, 2023, 1:46 a.m. UTC | #12
On 2023/2/27 23:08, Linux regression tracking (Thorsten Leemhuis) wrote:
> [CCing the regression list, as it should be in the loop for regressions:
> https://docs.kernel.org/admin-guide/reporting-regressions.html]
>
> On 07.02.23 12:29, Will Deacon wrote:
>> On Tue, Jan 31, 2023 at 05:03:32PM +0100, Ard Biesheuvel wrote:
>>> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
>>>> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
>>>>> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
>>>>>> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
>>>>>>> On 2022/12/27 17:26, Liu Shixin wrote:
>>>>>>>> After I added a 10GB pmem device, I got the following error message when
>>>>>>>> inserting a module:
>>>>>>>>
>>>>>>>>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
>>>>>>>>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
>>>>>>>>
>>>>>>>> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
>>>>>>>> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
>>>>>>>> window if ARM64_MODULE_PLTS is set, the module region is still easily
>>>>>>>> exhausted because the module region is located at bottom of vmalloc region
>>>>>>>> and the vmalloc region is allocated from bottom to top.
>>>>>>>>
>>>>>>>> Skip module region if not calling from module_alloc().
>>>>>> I'll assume this is for the arm tree.
>>>>>>
>>>>>> Acked-by: Andrew Morton <akpm@linux-foundation.org>
>>>>> This looks like the same issue previously reported at:
>>>>>
>>>>> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
>>>>>
>>>>> where Ard had a few suggestions but, afaict, they didn't help.
>>>>>
>>> Thanks for the cc.
>>>
>>> So this is a bit clunky, and I wonder whether we wouldn't be better
>>> off just splitting the vmalloc region into two separate regions: one
>>> for the kernel and modules, and one for everything else. That way, we
>>> lose one bit of entropy in the randomized placement, but the default
>>> 48-bit VA space is vast anyway, and even on 39-bit VA configs (such as
>>> Android), I seriously doubt that we come anywhere close to exhausting
>>> the vmalloc space today.
>> That sounds like a good idea to me.
>>
>> Liu Shixin -- do you think you could have a go at implementing Ard's
>> suggestion instead?
> Liu Shixin, did you ever look into realizing this idea?
This is in my work list, but I haven't implemented it yet. Sorry for the long delay.
> Or was some progress already made and I just missed it?
>
> I'm asking, as the idea discussed afaics is not only supposed to fix the
> regression you tried to address, but also one that is now three months
> old and stalled since Mid-December -- which is really unfortunate, as
> that's not how regressions should be handled. :-/ But well, it afaik was
> caused by a patch from Ard, so it's obviously not your job to address
> it. But it seems you were working on it.
>
> Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
> --
> Everything you wanna know about Linux kernel regression tracking:
> https://linux-regtracking.leemhuis.info/about/#tldr
> If I did something stupid, please tell me, as explained on that page.
>
> .
>
Liu Shixin Feb. 28, 2023, 1:48 a.m. UTC | #13
On 2023/2/28 0:14, Ard Biesheuvel wrote:
> On Mon, 27 Feb 2023 at 16:08, Linux regression tracking (Thorsten
> Leemhuis) <regressions@leemhuis.info> wrote:
>> [CCing the regression list, as it should be in the loop for regressions:
>> https://docs.kernel.org/admin-guide/reporting-regressions.html]
>>
>> On 07.02.23 12:29, Will Deacon wrote:
>>> On Tue, Jan 31, 2023 at 05:03:32PM +0100, Ard Biesheuvel wrote:
>>>> On Tue, 31 Jan 2023 at 16:07, Will Deacon <will@kernel.org> wrote:
>>>>> On Tue, Jan 31, 2023 at 03:06:44PM +0000, Will Deacon wrote:
>>>>>> On Sun, Jan 29, 2023 at 01:41:47PM -0800, Andrew Morton wrote:
>>>>>>> On Sun, 29 Jan 2023 10:44:31 +0800 Liu Shixin <liushixin2@huawei.com> wrote:
>>>>>>>> On 2022/12/27 17:26, Liu Shixin wrote:
>>>>>>>>> After I added a 10GB pmem device, I got the following error message when
>>>>>>>>> inserting a module:
>>>>>>>>>
>>>>>>>>>  insmod: vmalloc error: size 16384, vm_struct allocation failed,
>>>>>>>>>  mode:0xcc0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0
>>>>>>>>>
>>>>>>>>> If CONFIG_RANDOMIZE_BASE is set, the module region can be located in the
>>>>>>>>> vmalloc region entirely. Although module_alloc() can fall back to a 2GB
>>>>>>>>> window if ARM64_MODULE_PLTS is set, the module region is still easily
>>>>>>>>> exhausted because the module region is located at bottom of vmalloc region
>>>>>>>>> and the vmalloc region is allocated from bottom to top.
>>>>>>>>>
>>>>>>>>> Skip module region if not calling from module_alloc().
>>>>>>> I'll assume this is for the arm tree.
>>>>>>>
>>>>>>> Acked-by: Andrew Morton <akpm@linux-foundation.org>
>>>>>> This looks like the same issue previously reported at:
>>>>>>
>>>>>> https://lore.kernel.org/all/e6a804de-a5f7-c551-ffba-e09d04e438fc@hisilicon.com/
>>>>>>
>>>>>> where Ard had a few suggestions but, afaict, they didn't help.
>>>>>>
>>>> Thanks for the cc.
>>>>
>>>> So this is a bit clunky, and I wonder whether we wouldn't be better
>>>> off just splitting the vmalloc region into two separate regions: one
>>>> for the kernel and modules, and one for everything else. That way, we
>>>> lose one bit of entropy in the randomized placement, but the default
>>>> 48-bit VA space is vast anyway, and even on 39-bit VA configs (such as
>>>> Android), I seriously doubt that we come anywhere close to exhausting
>>>> the vmalloc space today.
>>> That sounds like a good idea to me.
>>>
>>> Liu Shixin -- do you think you could have a go at implementing Ard's
>>> suggestion instead?
>> Liu Shixin, did you ever look into realizing this idea?
>>
>> Or was some progress already made and I just missed it?
>>
> This patch
>
> https://lore.kernel.org/all/20230223204101.1500373-1-ardb@kernel.org/
>
> should fix the issue.
>
>> I'm asking, as the idea discussed afaics is not only supposed to fix the
>> regression you tried to address, but also one that is now three months
>> old and stalled since Mid-December -- which is really unfortunate, as
>> that's not how regressions should be handled. :-/
> Is it documented anywhere how regressions should be handled? The
> mailing list is flooded with hard to reproduce reports from users as
> well as automatic fuzzers and build bots, so I don't think it is
> entirely unreasonable to move unresponsive reporters to the back of
> the queue.
>
>> But well, it afaik was
>> caused by a patch from Ard, so it's obviously not your job to address
>> it. But it seems you were working on it.
>>
> We are all working together here, so please refrain from telling
> people what they should or should not be working on. (I am aware that
> you probably did not mean it that way, but things tend to get lost in
> translation very easily on the mailing list)
>
> Liu, could you please check whether the linked patch addresses your issue?
Thanks, I will try this patch.
>
> Thanks,
> Ard.
> .
>
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 38fafffe699f..4feff546b11b 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -31,4 +31,30 @@  static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
 	return pgprot_tagged(prot);
 }
 
+#ifdef CONFIG_RANDOMIZE_BASE
+extern u64 module_alloc_base;
+#define arch_vmap_skip_module_region arch_vmap_skip_module_region
+static inline void arch_vmap_skip_module_region(unsigned long *addr,
+						unsigned long vstart,
+						unsigned long size,
+						unsigned long align)
+{
+	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
+
+	if (vstart == module_alloc_base)
+		return;
+
+	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+		/* don't exceed the static module region - see module_alloc() */
+		module_alloc_end = MODULES_END;
+
+	if ((module_alloc_base >= *addr + size) ||
+	    (module_alloc_end <= *addr))
+		return;
+
+	*addr = ALIGN(module_alloc_end, align);
+}
+#endif
+
 #endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..55ef97325b84 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -122,6 +122,15 @@  static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
 }
 #endif
 
+#ifndef arch_vmap_skip_module_region
+static inline void arch_vmap_skip_module_region(unsigned long *addr,
+						unsigned long vstart,
+						unsigned long size,
+						unsigned long align)
+{
+}
+#endif
+
 /*
  *	Highlevel APIs for driver use
  */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca71de7c9d77..c840d673052e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1236,6 +1236,8 @@  is_within_this_va(struct vmap_area *va, unsigned long size,
 	else
 		nva_start_addr = ALIGN(vstart, align);
 
+	arch_vmap_skip_module_region(&nva_start_addr, vstart, size, align);
+
 	/* Can be overflowed due to big size or alignment. */
 	if (nva_start_addr + size < nva_start_addr ||
 			nva_start_addr < vstart)
@@ -1523,6 +1525,8 @@  __alloc_vmap_area(struct rb_root *root, struct list_head *head,
 	else
 		nva_start_addr = ALIGN(vstart, align);
 
+	arch_vmap_skip_module_region(&nva_start_addr, vstart, size, align);
+
 	/* Check the "vend" restriction. */
 	if (nva_start_addr + size > vend)
 		return vend;