diff mbox series

[v4,6/6] mm: secretmem: add ability to reserve memory at boot

Message ID 20200818141554.13945-7-rppt@kernel.org (mailing list archive)
State Superseded
Headers show
Series mm: introduce memfd_secret system call to create "secret" memory areas | expand

Commit Message

Mike Rapoport Aug. 18, 2020, 2:15 p.m. UTC
From: Mike Rapoport <rppt@linux.ibm.com>

Taking pages out from the direct map and bringing them back may create
undesired fragmentation and usage of the smaller pages in the direct
mapping of the physical memory.

This can be avoided if a significantly large area of the physical memory
would be reserved for secretmem purposes at boot time.

Add ability to reserve physical memory for secretmem at boot time using
"secretmem" kernel parameter and then use that reserved memory as a global
pool for secret memory needs.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 mm/secretmem.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 126 insertions(+), 8 deletions(-)

Comments

David Hildenbrand Aug. 19, 2020, 10:49 a.m. UTC | #1
On 18.08.20 16:15, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> Taking pages out from the direct map and bringing them back may create
> undesired fragmentation and usage of the smaller pages in the direct
> mapping of the physical memory.
> 
> This can be avoided if a significantly large area of the physical memory
> would be reserved for secretmem purposes at boot time.
> 
> Add ability to reserve physical memory for secretmem at boot time using
> "secretmem" kernel parameter and then use that reserved memory as a global
> pool for secret memory needs.

Wouldn't something like CMA be the better fit? Just wondering. Then, the
memory can actually be reused for something else while not needed.

> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  mm/secretmem.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 126 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/secretmem.c b/mm/secretmem.c
> index 333eb18fb483..54067ea62b2d 100644
> --- a/mm/secretmem.c
> +++ b/mm/secretmem.c
> @@ -14,6 +14,7 @@
>  #include <linux/pagemap.h>
>  #include <linux/genalloc.h>
>  #include <linux/syscalls.h>
> +#include <linux/memblock.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/set_memory.h>
>  #include <linux/sched/signal.h>
> @@ -45,6 +46,39 @@ struct secretmem_ctx {
>  	unsigned int mode;
>  };
>  
> +struct secretmem_pool {
> +	struct gen_pool *pool;
> +	unsigned long reserved_size;
> +	void *reserved;
> +};
> +
> +static struct secretmem_pool secretmem_pool;
> +
> +static struct page *secretmem_alloc_huge_page(gfp_t gfp)
> +{
> +	struct gen_pool *pool = secretmem_pool.pool;
> +	unsigned long addr = 0;
> +	struct page *page = NULL;
> +
> +	if (pool) {
> +		if (gen_pool_avail(pool) < PMD_SIZE)
> +			return NULL;
> +
> +		addr = gen_pool_alloc(pool, PMD_SIZE);
> +		if (!addr)
> +			return NULL;
> +
> +		page = virt_to_page(addr);
> +	} else {
> +		page = alloc_pages(gfp, PMD_PAGE_ORDER);
> +
> +		if (page)
> +			split_page(page, PMD_PAGE_ORDER);
> +	}
> +
> +	return page;
> +}
> +
>  static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
>  {
>  	unsigned long nr_pages = (1 << PMD_PAGE_ORDER);
> @@ -53,12 +87,11 @@ static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
>  	struct page *page;
>  	int err;
>  
> -	page = alloc_pages(gfp, PMD_PAGE_ORDER);
> +	page = secretmem_alloc_huge_page(gfp);
>  	if (!page)
>  		return -ENOMEM;
>  
>  	addr = (unsigned long)page_address(page);
> -	split_page(page, PMD_PAGE_ORDER);
>  
>  	err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE);
>  	if (err) {
> @@ -267,11 +300,13 @@ SYSCALL_DEFINE1(memfd_secret, unsigned long, flags)
>  	return err;
>  }
>  
> -static void secretmem_cleanup_chunk(struct gen_pool *pool,
> -				    struct gen_pool_chunk *chunk, void *data)
> +static void secretmem_recycle_range(unsigned long start, unsigned long end)
> +{
> +	gen_pool_free(secretmem_pool.pool, start, PMD_SIZE);
> +}
> +
> +static void secretmem_release_range(unsigned long start, unsigned long end)
>  {
> -	unsigned long start = chunk->start_addr;
> -	unsigned long end = chunk->end_addr;
>  	unsigned long nr_pages, addr;
>  
>  	nr_pages = (end - start + 1) / PAGE_SIZE;
> @@ -281,6 +316,18 @@ static void secretmem_cleanup_chunk(struct gen_pool *pool,
>  		put_page(virt_to_page(addr));
>  }
>  
> +static void secretmem_cleanup_chunk(struct gen_pool *pool,
> +				    struct gen_pool_chunk *chunk, void *data)
> +{
> +	unsigned long start = chunk->start_addr;
> +	unsigned long end = chunk->end_addr;
> +
> +	if (secretmem_pool.pool)
> +		secretmem_recycle_range(start, end);
> +	else
> +		secretmem_release_range(start, end);
> +}
> +
>  static void secretmem_cleanup_pool(struct secretmem_ctx *ctx)
>  {
>  	struct gen_pool *pool = ctx->pool;
> @@ -320,14 +367,85 @@ static struct file_system_type secretmem_fs = {
>  	.kill_sb	= kill_anon_super,
>  };
>  
> +static int secretmem_reserved_mem_init(void)
> +{
> +	struct gen_pool *pool;
> +	struct page *page;
> +	void *addr;
> +	int err;
> +
> +	if (!secretmem_pool.reserved)
> +		return 0;
> +
> +	pool = gen_pool_create(PMD_SHIFT, NUMA_NO_NODE);
> +	if (!pool)
> +		return -ENOMEM;
> +
> +	err = gen_pool_add(pool, (unsigned long)secretmem_pool.reserved,
> +			   secretmem_pool.reserved_size, NUMA_NO_NODE);
> +	if (err)
> +		goto err_destroy_pool;
> +
> +	for (addr = secretmem_pool.reserved;
> +	     addr < secretmem_pool.reserved + secretmem_pool.reserved_size;
> +	     addr += PAGE_SIZE) {
> +		page = virt_to_page(addr);
> +		__ClearPageReserved(page);
> +		set_page_count(page, 1);
> +	}
> +
> +	secretmem_pool.pool = pool;
> +	page = virt_to_page(secretmem_pool.reserved);
> +	__kernel_map_pages(page, secretmem_pool.reserved_size / PAGE_SIZE, 0);
> +	return 0;
> +
> +err_destroy_pool:
> +	gen_pool_destroy(pool);
> +	return err;
> +}
> +
>  static int secretmem_init(void)
>  {
> -	int ret = 0;
> +	int ret;
> +
> +	ret = secretmem_reserved_mem_init();
> +	if (ret)
> +		return ret;
>  
>  	secretmem_mnt = kern_mount(&secretmem_fs);
> -	if (IS_ERR(secretmem_mnt))
> +	if (IS_ERR(secretmem_mnt)) {
> +		gen_pool_destroy(secretmem_pool.pool);
>  		ret = PTR_ERR(secretmem_mnt);
> +	}
>  
>  	return ret;
>  }
>  fs_initcall(secretmem_init);
> +
> +static int __init secretmem_setup(char *str)
> +{
> +	phys_addr_t align = PMD_SIZE;
> +	unsigned long reserved_size;
> +	void *reserved;
> +
> +	reserved_size = memparse(str, NULL);
> +	if (!reserved_size)
> +		return 0;
> +
> +	if (reserved_size * 2 > PUD_SIZE)
> +		align = PUD_SIZE;
> +
> +	reserved = memblock_alloc(reserved_size, align);
> +	if (!reserved) {
> +		pr_err("failed to reserve %lu bytes\n", secretmem_pool.reserved_size);
> +		return 0;
> +	}
> +
> +	secretmem_pool.reserved_size = reserved_size;
> +	secretmem_pool.reserved = reserved;
> +
> +	pr_info("reserved %luM\n", reserved_size >> 20);
> +
> +	return 1;
> +}
> +__setup("secretmem=", secretmem_setup);
>
Mike Rapoport Aug. 19, 2020, 11:53 a.m. UTC | #2
On Wed, Aug 19, 2020 at 12:49:05PM +0200, David Hildenbrand wrote:
> On 18.08.20 16:15, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > Taking pages out from the direct map and bringing them back may create
> > undesired fragmentation and usage of the smaller pages in the direct
> > mapping of the physical memory.
> > 
> > This can be avoided if a significantly large area of the physical memory
> > would be reserved for secretmem purposes at boot time.
> > 
> > Add ability to reserve physical memory for secretmem at boot time using
> > "secretmem" kernel parameter and then use that reserved memory as a global
> > pool for secret memory needs.
> 
> Wouldn't something like CMA be the better fit? Just wondering. Then, the
> memory can actually be reused for something else while not needed.

The memory allocated as secret is removed from the direct map and the
boot time reservation is intended to reduce direct map fragmentation
and to avoid splitting 1G pages there. So with CMA I'd still need to
allocate 1G chunks for this and once 1G page is dropped from the direct
map it still cannot be reused for anything else until it is freed.

I could use CMA to do the boot time reservation, but doing the
reservation directly seemed simpler and more explicit to me.


> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >  mm/secretmem.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 126 insertions(+), 8 deletions(-)
> > 
> > diff --git a/mm/secretmem.c b/mm/secretmem.c
> > index 333eb18fb483..54067ea62b2d 100644
> > --- a/mm/secretmem.c
> > +++ b/mm/secretmem.c
> > @@ -14,6 +14,7 @@
> >  #include <linux/pagemap.h>
> >  #include <linux/genalloc.h>
> >  #include <linux/syscalls.h>
> > +#include <linux/memblock.h>
> >  #include <linux/pseudo_fs.h>
> >  #include <linux/set_memory.h>
> >  #include <linux/sched/signal.h>
> > @@ -45,6 +46,39 @@ struct secretmem_ctx {
> >  	unsigned int mode;
> >  };
> >  
> > +struct secretmem_pool {
> > +	struct gen_pool *pool;
> > +	unsigned long reserved_size;
> > +	void *reserved;
> > +};
> > +
> > +static struct secretmem_pool secretmem_pool;
> > +
> > +static struct page *secretmem_alloc_huge_page(gfp_t gfp)
> > +{
> > +	struct gen_pool *pool = secretmem_pool.pool;
> > +	unsigned long addr = 0;
> > +	struct page *page = NULL;
> > +
> > +	if (pool) {
> > +		if (gen_pool_avail(pool) < PMD_SIZE)
> > +			return NULL;
> > +
> > +		addr = gen_pool_alloc(pool, PMD_SIZE);
> > +		if (!addr)
> > +			return NULL;
> > +
> > +		page = virt_to_page(addr);
> > +	} else {
> > +		page = alloc_pages(gfp, PMD_PAGE_ORDER);
> > +
> > +		if (page)
> > +			split_page(page, PMD_PAGE_ORDER);
> > +	}
> > +
> > +	return page;
> > +}
> > +
> >  static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
> >  {
> >  	unsigned long nr_pages = (1 << PMD_PAGE_ORDER);
> > @@ -53,12 +87,11 @@ static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
> >  	struct page *page;
> >  	int err;
> >  
> > -	page = alloc_pages(gfp, PMD_PAGE_ORDER);
> > +	page = secretmem_alloc_huge_page(gfp);
> >  	if (!page)
> >  		return -ENOMEM;
> >  
> >  	addr = (unsigned long)page_address(page);
> > -	split_page(page, PMD_PAGE_ORDER);
> >  
> >  	err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE);
> >  	if (err) {
> > @@ -267,11 +300,13 @@ SYSCALL_DEFINE1(memfd_secret, unsigned long, flags)
> >  	return err;
> >  }
> >  
> > -static void secretmem_cleanup_chunk(struct gen_pool *pool,
> > -				    struct gen_pool_chunk *chunk, void *data)
> > +static void secretmem_recycle_range(unsigned long start, unsigned long end)
> > +{
> > +	gen_pool_free(secretmem_pool.pool, start, PMD_SIZE);
> > +}
> > +
> > +static void secretmem_release_range(unsigned long start, unsigned long end)
> >  {
> > -	unsigned long start = chunk->start_addr;
> > -	unsigned long end = chunk->end_addr;
> >  	unsigned long nr_pages, addr;
> >  
> >  	nr_pages = (end - start + 1) / PAGE_SIZE;
> > @@ -281,6 +316,18 @@ static void secretmem_cleanup_chunk(struct gen_pool *pool,
> >  		put_page(virt_to_page(addr));
> >  }
> >  
> > +static void secretmem_cleanup_chunk(struct gen_pool *pool,
> > +				    struct gen_pool_chunk *chunk, void *data)
> > +{
> > +	unsigned long start = chunk->start_addr;
> > +	unsigned long end = chunk->end_addr;
> > +
> > +	if (secretmem_pool.pool)
> > +		secretmem_recycle_range(start, end);
> > +	else
> > +		secretmem_release_range(start, end);
> > +}
> > +
> >  static void secretmem_cleanup_pool(struct secretmem_ctx *ctx)
> >  {
> >  	struct gen_pool *pool = ctx->pool;
> > @@ -320,14 +367,85 @@ static struct file_system_type secretmem_fs = {
> >  	.kill_sb	= kill_anon_super,
> >  };
> >  
> > +static int secretmem_reserved_mem_init(void)
> > +{
> > +	struct gen_pool *pool;
> > +	struct page *page;
> > +	void *addr;
> > +	int err;
> > +
> > +	if (!secretmem_pool.reserved)
> > +		return 0;
> > +
> > +	pool = gen_pool_create(PMD_SHIFT, NUMA_NO_NODE);
> > +	if (!pool)
> > +		return -ENOMEM;
> > +
> > +	err = gen_pool_add(pool, (unsigned long)secretmem_pool.reserved,
> > +			   secretmem_pool.reserved_size, NUMA_NO_NODE);
> > +	if (err)
> > +		goto err_destroy_pool;
> > +
> > +	for (addr = secretmem_pool.reserved;
> > +	     addr < secretmem_pool.reserved + secretmem_pool.reserved_size;
> > +	     addr += PAGE_SIZE) {
> > +		page = virt_to_page(addr);
> > +		__ClearPageReserved(page);
> > +		set_page_count(page, 1);
> > +	}
> > +
> > +	secretmem_pool.pool = pool;
> > +	page = virt_to_page(secretmem_pool.reserved);
> > +	__kernel_map_pages(page, secretmem_pool.reserved_size / PAGE_SIZE, 0);
> > +	return 0;
> > +
> > +err_destroy_pool:
> > +	gen_pool_destroy(pool);
> > +	return err;
> > +}
> > +
> >  static int secretmem_init(void)
> >  {
> > -	int ret = 0;
> > +	int ret;
> > +
> > +	ret = secretmem_reserved_mem_init();
> > +	if (ret)
> > +		return ret;
> >  
> >  	secretmem_mnt = kern_mount(&secretmem_fs);
> > -	if (IS_ERR(secretmem_mnt))
> > +	if (IS_ERR(secretmem_mnt)) {
> > +		gen_pool_destroy(secretmem_pool.pool);
> >  		ret = PTR_ERR(secretmem_mnt);
> > +	}
> >  
> >  	return ret;
> >  }
> >  fs_initcall(secretmem_init);
> > +
> > +static int __init secretmem_setup(char *str)
> > +{
> > +	phys_addr_t align = PMD_SIZE;
> > +	unsigned long reserved_size;
> > +	void *reserved;
> > +
> > +	reserved_size = memparse(str, NULL);
> > +	if (!reserved_size)
> > +		return 0;
> > +
> > +	if (reserved_size * 2 > PUD_SIZE)
> > +		align = PUD_SIZE;
> > +
> > +	reserved = memblock_alloc(reserved_size, align);
> > +	if (!reserved) {
> > +		pr_err("failed to reserve %lu bytes\n", secretmem_pool.reserved_size);
> > +		return 0;
> > +	}
> > +
> > +	secretmem_pool.reserved_size = reserved_size;
> > +	secretmem_pool.reserved = reserved;
> > +
> > +	pr_info("reserved %luM\n", reserved_size >> 20);
> > +
> > +	return 1;
> > +}
> > +__setup("secretmem=", secretmem_setup);
> > 
> 
> 
> -- 
> Thanks,
> 
> David / dhildenb
>
David Hildenbrand Aug. 19, 2020, 12:10 p.m. UTC | #3
On 19.08.20 13:53, Mike Rapoport wrote:
> On Wed, Aug 19, 2020 at 12:49:05PM +0200, David Hildenbrand wrote:
>> On 18.08.20 16:15, Mike Rapoport wrote:
>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>
>>> Taking pages out from the direct map and bringing them back may create
>>> undesired fragmentation and usage of the smaller pages in the direct
>>> mapping of the physical memory.
>>>
>>> This can be avoided if a significantly large area of the physical memory
>>> would be reserved for secretmem purposes at boot time.
>>>
>>> Add ability to reserve physical memory for secretmem at boot time using
>>> "secretmem" kernel parameter and then use that reserved memory as a global
>>> pool for secret memory needs.
>>
>> Wouldn't something like CMA be the better fit? Just wondering. Then, the
>> memory can actually be reused for something else while not needed.
> 
> The memory allocated as secret is removed from the direct map and the
> boot time reservation is intended to reduce direct map fragmentatioan
> and to avoid splitting 1G pages there. So with CMA I'd still need to
> allocate 1G chunks for this and once 1G page is dropped from the direct
> map it still cannot be reused for anything else until it is freed.
> 
> I could use CMA to do the boot time reservation, but doing the
> reservesion directly seemed simpler and more explicit to me.

Well, using CMA would give you the possibility to let the memory be used
for other purposes until you decide it's the right time to take it +
remove the direct mapping etc.

I wonder if a sane approach would be to require to allocate a pool
during boot and only take pages ever from that pool. That would avoid
spilling many unmovable pages all over the place, locally limiting them
to your area here.
Mike Rapoport Aug. 19, 2020, 5:33 p.m. UTC | #4
On Wed, Aug 19, 2020 at 02:10:43PM +0200, David Hildenbrand wrote:
> On 19.08.20 13:53, Mike Rapoport wrote:
> > On Wed, Aug 19, 2020 at 12:49:05PM +0200, David Hildenbrand wrote:
> >> On 18.08.20 16:15, Mike Rapoport wrote:
> >>> From: Mike Rapoport <rppt@linux.ibm.com>
> >>>
> >>> Taking pages out from the direct map and bringing them back may create
> >>> undesired fragmentation and usage of the smaller pages in the direct
> >>> mapping of the physical memory.
> >>>
> >>> This can be avoided if a significantly large area of the physical memory
> >>> would be reserved for secretmem purposes at boot time.
> >>>
> >>> Add ability to reserve physical memory for secretmem at boot time using
> >>> "secretmem" kernel parameter and then use that reserved memory as a global
> >>> pool for secret memory needs.
> >>
> >> Wouldn't something like CMA be the better fit? Just wondering. Then, the
> >> memory can actually be reused for something else while not needed.
> > 
> > The memory allocated as secret is removed from the direct map and the
> > boot time reservation is intended to reduce direct map fragmentatioan
> > and to avoid splitting 1G pages there. So with CMA I'd still need to
> > allocate 1G chunks for this and once 1G page is dropped from the direct
> > map it still cannot be reused for anything else until it is freed.
> > 
> > I could use CMA to do the boot time reservation, but doing the
> > reservesion directly seemed simpler and more explicit to me.
> 
> Well, using CMA would give you the possibility to let the memory be used
> for other purposes until you decide it's the right time to take it +
> remove the direct mapping etc.

I still can't say I follow you here. If I reserve a CMA area as a pool
for secret memory 1G pages, it is still reserved and it still cannot be
used for other purposes, right?

> I wonder if a sane approach would be to require to allocate a pool
> during boot and only take pages ever from that pool. That would avoid
> spilling many unmovable pages all over the place, locally limiting them
> to your area here.

That's what I tried to implement. The pool reserved at boot time is in a
way similar to booting with mem=X and then splitting the remaining
memory between the VMs.
In this case, the memory reserved at boot is never in the direct map and
allocations from such pool will not cause fragmentation.

> -- 
> Thanks,
> 
> David / dhildenb
>
David Hildenbrand Aug. 19, 2020, 5:45 p.m. UTC | #5
On 19.08.20 19:33, Mike Rapoport wrote:
> On Wed, Aug 19, 2020 at 02:10:43PM +0200, David Hildenbrand wrote:
>> On 19.08.20 13:53, Mike Rapoport wrote:
>>> On Wed, Aug 19, 2020 at 12:49:05PM +0200, David Hildenbrand wrote:
>>>> On 18.08.20 16:15, Mike Rapoport wrote:
>>>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>>>
>>>>> Taking pages out from the direct map and bringing them back may create
>>>>> undesired fragmentation and usage of the smaller pages in the direct
>>>>> mapping of the physical memory.
>>>>>
>>>>> This can be avoided if a significantly large area of the physical memory
>>>>> would be reserved for secretmem purposes at boot time.
>>>>>
>>>>> Add ability to reserve physical memory for secretmem at boot time using
>>>>> "secretmem" kernel parameter and then use that reserved memory as a global
>>>>> pool for secret memory needs.
>>>>
>>>> Wouldn't something like CMA be the better fit? Just wondering. Then, the
>>>> memory can actually be reused for something else while not needed.
>>>
>>> The memory allocated as secret is removed from the direct map and the
>>> boot time reservation is intended to reduce direct map fragmentatioan
>>> and to avoid splitting 1G pages there. So with CMA I'd still need to
>>> allocate 1G chunks for this and once 1G page is dropped from the direct
>>> map it still cannot be reused for anything else until it is freed.
>>>
>>> I could use CMA to do the boot time reservation, but doing the
>>> reservesion directly seemed simpler and more explicit to me.
>>
>> Well, using CMA would give you the possibility to let the memory be used
>> for other purposes until you decide it's the right time to take it +
>> remove the direct mapping etc.
> 
> I still can't say I follow you here. If I reseve a CMA area as a pool
> for secret memory 1G pages, it is still reserved and it still cannot be
> used for other purposes, right?

So, AFAIK, if you create a CMA pool it can be used for any MOVABLE
allocations (similar to ZONE_MOVABLE) until you actually allocate CMA
memory from that region. Other allocations on that area will then be
migrated away (using alloc_contig_range()).

For example, if you have a 1~GiB CMA area, you could allocate 4~MB pages
from that CMA area on demand (removing the direct mapping, etc ..), and
free when no longer needed (instantiating the direct mapping). The free
memory in that area could be used for MOVABLE allocations.

Please let me know if I am missing something important.
Mike Rapoport Aug. 20, 2020, 3:52 p.m. UTC | #6
On Wed, Aug 19, 2020 at 07:45:29PM +0200, David Hildenbrand wrote:
> On 19.08.20 19:33, Mike Rapoport wrote:
> > On Wed, Aug 19, 2020 at 02:10:43PM +0200, David Hildenbrand wrote:
> >> On 19.08.20 13:53, Mike Rapoport wrote:
> >>> On Wed, Aug 19, 2020 at 12:49:05PM +0200, David Hildenbrand wrote:
> >>>> On 18.08.20 16:15, Mike Rapoport wrote:
> >>>>> From: Mike Rapoport <rppt@linux.ibm.com>
> >>>>>
> >>>>> Taking pages out from the direct map and bringing them back may create
> >>>>> undesired fragmentation and usage of the smaller pages in the direct
> >>>>> mapping of the physical memory.
> >>>>>
> >>>>> This can be avoided if a significantly large area of the physical memory
> >>>>> would be reserved for secretmem purposes at boot time.
> >>>>>
> >>>>> Add ability to reserve physical memory for secretmem at boot time using
> >>>>> "secretmem" kernel parameter and then use that reserved memory as a global
> >>>>> pool for secret memory needs.
> >>>>
> >>>> Wouldn't something like CMA be the better fit? Just wondering. Then, the
> >>>> memory can actually be reused for something else while not needed.
> >>>
> >>> The memory allocated as secret is removed from the direct map and the
> >>> boot time reservation is intended to reduce direct map fragmentatioan
> >>> and to avoid splitting 1G pages there. So with CMA I'd still need to
> >>> allocate 1G chunks for this and once 1G page is dropped from the direct
> >>> map it still cannot be reused for anything else until it is freed.
> >>>
> >>> I could use CMA to do the boot time reservation, but doing the
> >>> reservesion directly seemed simpler and more explicit to me.
> >>
> >> Well, using CMA would give you the possibility to let the memory be used
> >> for other purposes until you decide it's the right time to take it +
> >> remove the direct mapping etc.
> > 
> > I still can't say I follow you here. If I reseve a CMA area as a pool
> > for secret memory 1G pages, it is still reserved and it still cannot be
> > used for other purposes, right?
> 
> So, AFAIK, if you create a CMA pool it can be used for any MOVABLE
> allocations (similar to ZONE_MOVABLE) until you actually allocate CMA
> memory from that region. Other allocations on that are will then be
> migrated away (using alloc_contig_range()).
> 
> For example, if you have a 1~GiB CMA area, you could allocate 4~MB pages
> from that CMA area on demand (removing the direct mapping, etc ..), and
> free when no longer needed (instantiating the direct mapping). The free
> memory in that area could used for MOVABLE allocations.

The boot time reservation is intended to avoid splitting 1G pages in the
direct map. Without the boot time reservation, we maintain a pool of 2M
pages so the 1G pages are split and 2M pages remain unsplit.

If I scale your example to match the requirement to avoid splitting 1G
pages in the direct map, that would mean creating a CMA area of several
tens of gigabytes and then doing cma_alloc() of 1G each time we need to
refill the secretmem pool. 

It is quite probable that we won't be able to get 1G from CMA after the
system worked for some time.

With boot time reservation we won't need physically contiguous 1G to
satisfy smaller allocation requests for secretmem because we don't need
to maintain 1G mappings in the secretmem pool.

That said, I believe the addition of the boot time reservation, either
direct or with CMA, can be added as an incremental patch after the
"core" functionality is merged.

> Please let me know if I am missing something important.
> 
> -- 
> Thanks,
> 
> David / dhildenb
>
David Hildenbrand Sept. 8, 2020, 9:09 a.m. UTC | #7
On 20.08.20 17:52, Mike Rapoport wrote:
> On Wed, Aug 19, 2020 at 07:45:29PM +0200, David Hildenbrand wrote:
>> On 19.08.20 19:33, Mike Rapoport wrote:
>>> On Wed, Aug 19, 2020 at 02:10:43PM +0200, David Hildenbrand wrote:
>>>> On 19.08.20 13:53, Mike Rapoport wrote:
>>>>> On Wed, Aug 19, 2020 at 12:49:05PM +0200, David Hildenbrand wrote:
>>>>>> On 18.08.20 16:15, Mike Rapoport wrote:
>>>>>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>>>>>
>>>>>>> Taking pages out from the direct map and bringing them back may create
>>>>>>> undesired fragmentation and usage of the smaller pages in the direct
>>>>>>> mapping of the physical memory.
>>>>>>>
>>>>>>> This can be avoided if a significantly large area of the physical memory
>>>>>>> would be reserved for secretmem purposes at boot time.
>>>>>>>
>>>>>>> Add ability to reserve physical memory for secretmem at boot time using
>>>>>>> "secretmem" kernel parameter and then use that reserved memory as a global
>>>>>>> pool for secret memory needs.
>>>>>>
>>>>>> Wouldn't something like CMA be the better fit? Just wondering. Then, the
>>>>>> memory can actually be reused for something else while not needed.
>>>>>
>>>>> The memory allocated as secret is removed from the direct map and the
>>>>> boot time reservation is intended to reduce direct map fragmentatioan
>>>>> and to avoid splitting 1G pages there. So with CMA I'd still need to
>>>>> allocate 1G chunks for this and once 1G page is dropped from the direct
>>>>> map it still cannot be reused for anything else until it is freed.
>>>>>
>>>>> I could use CMA to do the boot time reservation, but doing the
>>>>> reservesion directly seemed simpler and more explicit to me.
>>>>
>>>> Well, using CMA would give you the possibility to let the memory be used
>>>> for other purposes until you decide it's the right time to take it +
>>>> remove the direct mapping etc.
>>>
>>> I still can't say I follow you here. If I reseve a CMA area as a pool
>>> for secret memory 1G pages, it is still reserved and it still cannot be
>>> used for other purposes, right?
>>
>> So, AFAIK, if you create a CMA pool it can be used for any MOVABLE
>> allocations (similar to ZONE_MOVABLE) until you actually allocate CMA
>> memory from that region. Other allocations on that are will then be
>> migrated away (using alloc_contig_range()).
>>
>> For example, if you have a 1~GiB CMA area, you could allocate 4~MB pages
>> from that CMA area on demand (removing the direct mapping, etc ..), and
>> free when no longer needed (instantiating the direct mapping). The free
>> memory in that area could used for MOVABLE allocations.
> 
> The boot time resrvation is intended to avoid splitting 1G pages in the
> direct map. Without the boot time reservation, we maintain a pool of 2M
> pages so the 1G pages are split and 2M pages remain unsplit.
> 
> If I scale your example to match the requirement to avoid splitting 1G
> pages in the direct map, that would mean creating a CMA area of several
> tens of gigabytes and then doing cma_alloc() of 1G each time we need to
> refill the secretmem pool. 
> 
> It is quite probable that we won't be able to get 1G from CMA after the
> system worked for some time.

Why? It should only contain movable pages, and if that is not the case,
it's a bug we have to fix. It should behave just as ZONE_MOVABLE.
(although I agree that in corner cases, alloc_contig_pages() might
temporarily fail on some chunks - e.g., with long/short-term page
pinnings - in contrast to memory offlining, it won't retry forever)

> 
> With boot time reservation we won't need physcally contiguous 1G to
> satisfy smaller allocation requests for secretmem because we don't need
> to maintain 1G mappings in the secretmem pool.

You can allocate within your CMA area however you want - doesn't need to
be whole gigabytes in case there is no need for it.

Again, the big benefit of CMA is that the reserved memory can be reused
for other purpose while nobody is actually making use of it.

> 
> That said, I believe the addition of the boot time reservation, either
> direct or with CMA, can be added as an incrememntal patch after the
> "core" functionality is merged.

I am not convinced that we want to let random processes to do
alloc_pages() in the range of tens of gigabytes. It's not just mlocked
memory. I prefer either using CMA or relying on the boot time
reservations. But let's see if there are other opinions and people just
don't care.

Having that said, I have no further comments.
Mike Rapoport Sept. 8, 2020, 12:31 p.m. UTC | #8
Hi David,

On Tue, Sep 08, 2020 at 11:09:19AM +0200, David Hildenbrand wrote:
> On 20.08.20 17:52, Mike Rapoport wrote:
> > On Wed, Aug 19, 2020 at 07:45:29PM +0200, David Hildenbrand wrote:
> >> On 19.08.20 19:33, Mike Rapoport wrote:
> >>> On Wed, Aug 19, 2020 at 02:10:43PM +0200, David Hildenbrand wrote:
> >>>> On 19.08.20 13:53, Mike Rapoport wrote:
> >>>>> On Wed, Aug 19, 2020 at 12:49:05PM +0200, David Hildenbrand wrote:
> >>>>>> On 18.08.20 16:15, Mike Rapoport wrote:
> >>>>>>> From: Mike Rapoport <rppt@linux.ibm.com>
> >>>>>>>
> >>>>>>> Taking pages out from the direct map and bringing them back may create
> >>>>>>> undesired fragmentation and usage of the smaller pages in the direct
> >>>>>>> mapping of the physical memory.
> >>>>>>>
> >>>>>>> This can be avoided if a significantly large area of the physical memory
> >>>>>>> would be reserved for secretmem purposes at boot time.
> >>>>>>>
> >>>>>>> Add ability to reserve physical memory for secretmem at boot time using
> >>>>>>> "secretmem" kernel parameter and then use that reserved memory as a global
> >>>>>>> pool for secret memory needs.
> >>>>>>
> >>>>>> Wouldn't something like CMA be the better fit? Just wondering. Then, the
> >>>>>> memory can actually be reused for something else while not needed.
> >>>>>
> >>>>> The memory allocated as secret is removed from the direct map and the
> >>>>> boot time reservation is intended to reduce direct map fragmentatioan
> >>>>> and to avoid splitting 1G pages there. So with CMA I'd still need to
> >>>>> allocate 1G chunks for this and once 1G page is dropped from the direct
> >>>>> map it still cannot be reused for anything else until it is freed.
> >>>>>
> >>>>> I could use CMA to do the boot time reservation, but doing the
> >>>>> reservesion directly seemed simpler and more explicit to me.
> >>>>
> >>>> Well, using CMA would give you the possibility to let the memory be used
> >>>> for other purposes until you decide it's the right time to take it +
> >>>> remove the direct mapping etc.
> >>>
> >>> I still can't say I follow you here. If I reseve a CMA area as a pool
> >>> for secret memory 1G pages, it is still reserved and it still cannot be
> >>> used for other purposes, right?
> >>
> >> So, AFAIK, if you create a CMA pool it can be used for any MOVABLE
> >> allocations (similar to ZONE_MOVABLE) until you actually allocate CMA
> >> memory from that region. Other allocations on that are will then be
> >> migrated away (using alloc_contig_range()).
> >>
> >> For example, if you have a 1~GiB CMA area, you could allocate 4~MB pages
> >> from that CMA area on demand (removing the direct mapping, etc ..), and
> >> free when no longer needed (instantiating the direct mapping). The free
> >> memory in that area could used for MOVABLE allocations.
> > 
> > The boot time resrvation is intended to avoid splitting 1G pages in the
> > direct map. Without the boot time reservation, we maintain a pool of 2M
> > pages so the 1G pages are split and 2M pages remain unsplit.
> > 
> > If I scale your example to match the requirement to avoid splitting 1G
> > pages in the direct map, that would mean creating a CMA area of several
> > tens of gigabytes and then doing cma_alloc() of 1G each time we need to
> > refill the secretmem pool. 
> > 
> > It is quite probable that we won't be able to get 1G from CMA after the
> > system worked for some time.
> 
> Why? It should only contain movable pages, and if that is not the case,
> it's a bug we have to fix. It should behave just as ZONE_MOVABLE.
> (although I agree that in corner cases, alloc_contig_pages() might
> temporarily fail on some chunks - e.g., with long/short-term page
> pinnings - in contrast to memory offlining, it won't retry forever)
 
The use-case I had in mind for the boot time reservation in secretmem is
a machine that runs VMs and there is a desire to have the VM memory
protected from the host. In a way this should be similar to booting a
host with mem=X where most of the machine memory never gets to be used
by the host kernel.

For such use case, boot time reservation controlled by the command
line parameter seems to me simpler than using CMA. I agree that there is
no way to use the reserved memory for other purpose, but then we won't
need to create physically contiguous chunk of several gigs every time a
VM is created.

> > With boot time reservation we won't need physically contiguous 1G to
> > satisfy smaller allocation requests for secretmem because we don't need
> > to maintain 1G mappings in the secretmem pool.
> 
> You can allocate within your CMA area however you want - doesn't need to
> be whole gigabytes in case there is no need for it.

The whole point of boot time reservation is to prevent splitting 1G
pages in the direct map. Allocating smaller chunks will still cause
fragmentation of the direct map.

> Again, the big benefit of CMA is that the reserved memory can be reused
> for other purpose while nobody is actually making use of it.

Right, but I think if a user explicitly asked to use X gigabytes for the
secretmem we can allow that.

> > 
> > That said, I believe the addition of the boot time reservation, either
> > direct or with CMA, can be added as an incrememntal patch after the
> > "core" functionality is merged.
> 
> I am not convinced that we want to let random processes to do
> alloc_pages() in the range of tens of gigabytes. It's not just mlocked
> memory. I prefer either using CMA or relying on the boot time
> reservations. But let's see if there are other opinions and people just
> don't care.
> 
> Having that said, I have no further comments.
> 
> -- 
> Thanks,
> 
> David / dhildenb
>
diff mbox series

Patch

diff --git a/mm/secretmem.c b/mm/secretmem.c
index 333eb18fb483..54067ea62b2d 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -14,6 +14,7 @@ 
 #include <linux/pagemap.h>
 #include <linux/genalloc.h>
 #include <linux/syscalls.h>
+#include <linux/memblock.h>
 #include <linux/pseudo_fs.h>
 #include <linux/set_memory.h>
 #include <linux/sched/signal.h>
@@ -45,6 +46,39 @@  struct secretmem_ctx {
 	unsigned int mode;
 };
 
+struct secretmem_pool {
+	struct gen_pool *pool;
+	unsigned long reserved_size;
+	void *reserved;
+};
+
+static struct secretmem_pool secretmem_pool;
+
+static struct page *secretmem_alloc_huge_page(gfp_t gfp)
+{
+	struct gen_pool *pool = secretmem_pool.pool;
+	unsigned long addr = 0;
+	struct page *page = NULL;
+
+	if (pool) {
+		if (gen_pool_avail(pool) < PMD_SIZE)
+			return NULL;
+
+		addr = gen_pool_alloc(pool, PMD_SIZE);
+		if (!addr)
+			return NULL;
+
+		page = virt_to_page(addr);
+	} else {
+		page = alloc_pages(gfp, PMD_PAGE_ORDER);
+
+		if (page)
+			split_page(page, PMD_PAGE_ORDER);
+	}
+
+	return page;
+}
+
 static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
 {
 	unsigned long nr_pages = (1 << PMD_PAGE_ORDER);
@@ -53,12 +87,11 @@  static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp)
 	struct page *page;
 	int err;
 
-	page = alloc_pages(gfp, PMD_PAGE_ORDER);
+	page = secretmem_alloc_huge_page(gfp);
 	if (!page)
 		return -ENOMEM;
 
 	addr = (unsigned long)page_address(page);
-	split_page(page, PMD_PAGE_ORDER);
 
 	err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE);
 	if (err) {
@@ -267,11 +300,13 @@  SYSCALL_DEFINE1(memfd_secret, unsigned long, flags)
 	return err;
 }
 
-static void secretmem_cleanup_chunk(struct gen_pool *pool,
-				    struct gen_pool_chunk *chunk, void *data)
+static void secretmem_recycle_range(unsigned long start, unsigned long end)
+{
+	gen_pool_free(secretmem_pool.pool, start, PMD_SIZE);
+}
+
+static void secretmem_release_range(unsigned long start, unsigned long end)
 {
-	unsigned long start = chunk->start_addr;
-	unsigned long end = chunk->end_addr;
 	unsigned long nr_pages, addr;
 
 	nr_pages = (end - start + 1) / PAGE_SIZE;
@@ -281,6 +316,18 @@  static void secretmem_cleanup_chunk(struct gen_pool *pool,
 		put_page(virt_to_page(addr));
 }
 
+static void secretmem_cleanup_chunk(struct gen_pool *pool,
+				    struct gen_pool_chunk *chunk, void *data)
+{
+	unsigned long start = chunk->start_addr;
+	unsigned long end = chunk->end_addr;
+
+	if (secretmem_pool.pool)
+		secretmem_recycle_range(start, end);
+	else
+		secretmem_release_range(start, end);
+}
+
 static void secretmem_cleanup_pool(struct secretmem_ctx *ctx)
 {
 	struct gen_pool *pool = ctx->pool;
@@ -320,14 +367,85 @@  static struct file_system_type secretmem_fs = {
 	.kill_sb	= kill_anon_super,
 };
 
+static int secretmem_reserved_mem_init(void)
+{
+	struct gen_pool *pool;
+	struct page *page;
+	void *addr;
+	int err;
+
+	if (!secretmem_pool.reserved)
+		return 0;
+
+	pool = gen_pool_create(PMD_SHIFT, NUMA_NO_NODE);
+	if (!pool)
+		return -ENOMEM;
+
+	err = gen_pool_add(pool, (unsigned long)secretmem_pool.reserved,
+			   secretmem_pool.reserved_size, NUMA_NO_NODE);
+	if (err)
+		goto err_destroy_pool;
+
+	for (addr = secretmem_pool.reserved;
+	     addr < secretmem_pool.reserved + secretmem_pool.reserved_size;
+	     addr += PAGE_SIZE) {
+		page = virt_to_page(addr);
+		__ClearPageReserved(page);
+		set_page_count(page, 1);
+	}
+
+	secretmem_pool.pool = pool;
+	page = virt_to_page(secretmem_pool.reserved);
+	__kernel_map_pages(page, secretmem_pool.reserved_size / PAGE_SIZE, 0);
+	return 0;
+
+err_destroy_pool:
+	gen_pool_destroy(pool);
+	return err;
+}
+
 static int secretmem_init(void)
 {
-	int ret = 0;
+	int ret;
+
+	ret = secretmem_reserved_mem_init();
+	if (ret)
+		return ret;
 
 	secretmem_mnt = kern_mount(&secretmem_fs);
-	if (IS_ERR(secretmem_mnt))
+	if (IS_ERR(secretmem_mnt)) {
+		gen_pool_destroy(secretmem_pool.pool);
 		ret = PTR_ERR(secretmem_mnt);
+	}
 
 	return ret;
 }
 fs_initcall(secretmem_init);
+
+static int __init secretmem_setup(char *str)
+{
+	phys_addr_t align = PMD_SIZE;
+	unsigned long reserved_size;
+	void *reserved;
+
+	reserved_size = memparse(str, NULL);
+	if (!reserved_size)
+		return 0;
+
+	if (reserved_size * 2 > PUD_SIZE)
+		align = PUD_SIZE;
+
+	reserved = memblock_alloc(reserved_size, align);
+	if (!reserved) {
+		pr_err("failed to reserve %lu bytes\n", secretmem_pool.reserved_size);
+		return 0;
+	}
+
+	secretmem_pool.reserved_size = reserved_size;
+	secretmem_pool.reserved = reserved;
+
+	pr_info("reserved %luM\n", reserved_size >> 20);
+
+	return 1;
+}
+__setup("secretmem=", secretmem_setup);