diff mbox series

[1/2] mm/memblock: Add "reserve_mem" to reserved named memory at boot up

Message ID 20240603233631.452433539@goodmis.org (mailing list archive)
State Handled Elsewhere
Headers show
Series mm/pstore: Reserve named unspecified memory across boots

Commit Message

Steven Rostedt June 3, 2024, 11:33 p.m. UTC
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>

In order to allow for requesting a memory region that can be used for
things like pstore on multiple machines where the memory layout is not the
same, add a new option to the kernel command line called "reserve_mem".

The format is:  reserve_mem=nn:align:name

Where it will find nn amount of memory at the given alignment of align.
The name field is to allow another subsystem to retrieve where the memory
was found. For example:

  reserve_mem=12M:4096:oops ramoops.mem_name=oops

Where ramoops.mem_name will tell ramoops that memory was reserved for it
via the reserve_mem option and it can find it by calling:

  if (reserve_mem_find_by_name("oops", &start, &size)) {
	// start holds the start address and size holds the size given

Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/

Suggested-by: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/mm.h |  2 +
 mm/memblock.c      | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
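
For illustration, a consumer of this interface might look roughly like the
sketch below. Only reserve_mem_find_by_name() comes from this patch; the
driver name, init hook, buffer, and mapping choice (memremap() with
MEMREMAP_WB) are assumptions made for the example:

  /*
   * Hypothetical consumer sketch -- only reserve_mem_find_by_name() is
   * from this patch; everything else is illustrative.
   */
  #include <linux/init.h>
  #include <linux/io.h>
  #include <linux/mm.h>

  static void *oops_buf;

  static int __init my_oops_init(void)
  {
  	unsigned long start, size;

  	/* Look up the region reserved via reserve_mem=12M:4096:oops */
  	if (!reserve_mem_find_by_name("oops", &start, &size))
  		return -ENODEV;

  	/* Map the physical range; contents may or may not survive a reboot */
  	oops_buf = memremap(start, size, MEMREMAP_WB);
  	if (!oops_buf)
  		return -ENOMEM;

  	return 0;
  }
  late_initcall(my_oops_init);

If the lookup fails (the option is missing or the name is not found), such a
consumer can simply fall back to running without persistent storage.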

Comments

Kees Cook June 4, 2024, 5:52 a.m. UTC | #1
On June 3, 2024 4:33:31 PM PDT, Steven Rostedt <rostedt@goodmis.org> wrote:
>From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
>In order to allow for requesting a memory region that can be used for
>things like pstore on multiple machines where the memory layout is not the
>same, add a new option to the kernel command line called "reserve_mem".
>
>The format is:  reserve_mem=nn:align:name
>
>Where it will find nn amount of memory at the given alignment of align.
>The name field is to allow another subsystem to retrieve where the memory
>was found. For example:
>
>  reserve_mem=12M:4096:oops ramoops.mem_name=oops

How does this interact with KASLR? It has chosen its physical location before this parsing happens, so I'd expect this to fail once in a while, unless the size/alignment is lucky enough that KASLR never uses that portion of the physical memory...

-Kees

>
>Where ramoops.mem_name will tell ramoops that memory was reserved for it
>via the reserve_mem option and it can find it by calling:
>
>  if (reserve_mem_find_by_name("oops", &start, &size)) {
>	// start holds the start address and size holds the size given
>
>Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/
>
>Suggested-by: Mike Rapoport <rppt@kernel.org>
>Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
>---
> include/linux/mm.h |  2 +
> mm/memblock.c      | 97 ++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 99 insertions(+)
>
>diff --git a/include/linux/mm.h b/include/linux/mm.h
>index 9849dfda44d4..b4455cc02f2c 100644
>--- a/include/linux/mm.h
>+++ b/include/linux/mm.h
>@@ -4263,4 +4263,6 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
> void vma_pgtable_walk_begin(struct vm_area_struct *vma);
> void vma_pgtable_walk_end(struct vm_area_struct *vma);
> 
>+int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size);
>+
> #endif /* _LINUX_MM_H */
>diff --git a/mm/memblock.c b/mm/memblock.c
>index d09136e040d3..a8bf0ee9e2b4 100644
>--- a/mm/memblock.c
>+++ b/mm/memblock.c
>@@ -2244,6 +2244,103 @@ void __init memblock_free_all(void)
> 	totalram_pages_add(pages);
> }
> 
>+/* Keep a table to reserve named memory */
>+#define RESERVE_MEM_MAX_ENTRIES		8
>+#define RESERVE_MEM_NAME_SIZE		16
>+struct reserve_mem_table {
>+	char			name[RESERVE_MEM_NAME_SIZE];
>+	unsigned long		start;
>+	unsigned long		size;
>+};
>+static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
>+static int reserved_mem_count;
>+
>+/* Add wildcard region with a lookup name */
>+static int __init reserved_mem_add(unsigned long start, unsigned long size,
>+				   const char *name)
>+{
>+	struct reserve_mem_table *map;
>+
>+	if (!name || !name[0] || strlen(name) >= RESERVE_MEM_NAME_SIZE)
>+		return -EINVAL;
>+
>+	if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES)
>+		return -1;
>+
>+	map = &reserved_mem_table[reserved_mem_count++];
>+	map->start = start;
>+	map->size = size;
>+	strscpy(map->name, name);
>+	return 0;
>+}
>+
>+/**
>+ * reserve_mem_find_by_name - Find reserved memory region with a given name
>+ * @name: The name that is attached to a reserved memory region
>+ * @start: If found, holds the start address
>+ * @size: If found, holds the size of the address.
>+ *
>+ * Returns: 1 if found or 0 if not found.
>+ */
>+int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size)
>+{
>+	struct reserve_mem_table *map;
>+	int i;
>+
>+	for (i = 0; i < reserved_mem_count; i++) {
>+		map = &reserved_mem_table[i];
>+		if (!map->size)
>+			continue;
>+		if (strcmp(name, map->name) == 0) {
>+			*start = map->start;
>+			*size = map->size;
>+			return 1;
>+		}
>+	}
>+	return 0;
>+}
>+
>+/*
>+ * Parse reserve_mem=nn:align:name
>+ */
>+static int __init reserve_mem(char *p)
>+{
>+	phys_addr_t start, size, align;
>+	char *oldp;
>+	int err;
>+
>+	if (!p)
>+		return -EINVAL;
>+
>+	oldp = p;
>+	size = memparse(p, &p);
>+	if (p == oldp)
>+		return -EINVAL;
>+
>+	if (*p != ':')
>+		return -EINVAL;
>+
>+	align = memparse(p+1, &p);
>+	if (*p != ':')
>+		return -EINVAL;
>+
>+	start = memblock_phys_alloc(size, align);
>+	if (!start)
>+		return -ENOMEM;
>+
>+	p++;
>+	err = reserved_mem_add(start, size, p);
>+	if (err) {
>+		memblock_phys_free(start, size);
>+		return err;
>+	}
>+
>+	p += strlen(p);
>+
>+	return *p == '\0' ? 0: -EINVAL;
>+}
>+__setup("reserve_mem=", reserve_mem);
>+
> #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
> static const char * const flagname[] = {
> 	[ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
Ard Biesheuvel June 4, 2024, 6:03 a.m. UTC | #2
On Tue, 4 Jun 2024 at 01:35, Steven Rostedt <rostedt@goodmis.org> wrote:
>
> From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
>
> In order to allow for requesting a memory region that can be used for
> things like pstore on multiple machines where the memory layout is not the
> same, add a new option to the kernel command line called "reserve_mem".
>
> The format is:  reserve_mem=nn:align:name
>
> Where it will find nn amount of memory at the given alignment of align.
> The name field is to allow another subsystem to retrieve where the memory
> was found. For example:
>
>   reserve_mem=12M:4096:oops ramoops.mem_name=oops
>
> Where ramoops.mem_name will tell ramoops that memory was reserved for it
> via the reserve_mem option and it can find it by calling:
>
>   if (reserve_mem_find_by_name("oops", &start, &size)) {
>         // start holds the start address and size holds the size given
>
> Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/
>
> Suggested-by: Mike Rapoport <rppt@kernel.org>
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>

You failed to point out in the commit message that the assumption here
is that this memory will retain its contents across a soft reboot. Or
am I misunderstanding this?

In any case, as I pointed out before, playing these games unilaterally
from the kernel side, i.e., without any awareness whatsoever from the
firmware and bootloader (which will not attempt to preserve RAM
contents), is likely to have a rather disappointing success ratio in
the general case. I understand this may be different for vertically
integrated software stacks like ChromeOS so perhaps it should live
there as a feature.

Then, as Kees points out, there is also the risk that the kernel
itself may be stepping on this memory before having realized that it
is reserved. At least ARM and x86 have decompressors with a
substantial amount of non-trivial placement logic that would need to
be made aware of this reservation. Note that EFI vs. non-EFI boot also
makes a difference here.


> ---
>  include/linux/mm.h |  2 +
>  mm/memblock.c      | 97 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 99 insertions(+)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9849dfda44d4..b4455cc02f2c 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4263,4 +4263,6 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
>  void vma_pgtable_walk_begin(struct vm_area_struct *vma);
>  void vma_pgtable_walk_end(struct vm_area_struct *vma);
>
> +int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size);
> +
>  #endif /* _LINUX_MM_H */
> diff --git a/mm/memblock.c b/mm/memblock.c
> index d09136e040d3..a8bf0ee9e2b4 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -2244,6 +2244,103 @@ void __init memblock_free_all(void)
>         totalram_pages_add(pages);
>  }
>
> +/* Keep a table to reserve named memory */
> +#define RESERVE_MEM_MAX_ENTRIES                8
> +#define RESERVE_MEM_NAME_SIZE          16
> +struct reserve_mem_table {
> +       char                    name[RESERVE_MEM_NAME_SIZE];
> +       unsigned long           start;
> +       unsigned long           size;
> +};
> +static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
> +static int reserved_mem_count;
> +
> +/* Add wildcard region with a lookup name */
> +static int __init reserved_mem_add(unsigned long start, unsigned long size,
> +                                  const char *name)
> +{
> +       struct reserve_mem_table *map;
> +
> +       if (!name || !name[0] || strlen(name) >= RESERVE_MEM_NAME_SIZE)
> +               return -EINVAL;
> +
> +       if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES)
> +               return -1;
> +
> +       map = &reserved_mem_table[reserved_mem_count++];
> +       map->start = start;
> +       map->size = size;
> +       strscpy(map->name, name);
> +       return 0;
> +}
> +
> +/**
> + * reserve_mem_find_by_name - Find reserved memory region with a given name
> + * @name: The name that is attached to a reserved memory region
> + * @start: If found, holds the start address
> + * @size: If found, holds the size of the address.
> + *
> + * Returns: 1 if found or 0 if not found.
> + */
> +int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size)
> +{
> +       struct reserve_mem_table *map;
> +       int i;
> +
> +       for (i = 0; i < reserved_mem_count; i++) {
> +               map = &reserved_mem_table[i];
> +               if (!map->size)
> +                       continue;
> +               if (strcmp(name, map->name) == 0) {
> +                       *start = map->start;
> +                       *size = map->size;
> +                       return 1;
> +               }
> +       }
> +       return 0;
> +}
> +
> +/*
> + * Parse reserve_mem=nn:align:name
> + */
> +static int __init reserve_mem(char *p)
> +{
> +       phys_addr_t start, size, align;
> +       char *oldp;
> +       int err;
> +
> +       if (!p)
> +               return -EINVAL;
> +
> +       oldp = p;
> +       size = memparse(p, &p);
> +       if (p == oldp)
> +               return -EINVAL;
> +
> +       if (*p != ':')
> +               return -EINVAL;
> +
> +       align = memparse(p+1, &p);
> +       if (*p != ':')
> +               return -EINVAL;
> +
> +       start = memblock_phys_alloc(size, align);
> +       if (!start)
> +               return -ENOMEM;
> +
> +       p++;
> +       err = reserved_mem_add(start, size, p);
> +       if (err) {
> +               memblock_phys_free(start, size);
> +               return err;
> +       }
> +
> +       p += strlen(p);
> +
> +       return *p == '\0' ? 0: -EINVAL;
> +}
> +__setup("reserve_mem=", reserve_mem);
> +
>  #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
>  static const char * const flagname[] = {
>         [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
> --
> 2.43.0
>
>
>
Steven Rostedt June 4, 2024, 10:57 a.m. UTC | #3
On Mon, 03 Jun 2024 22:52:37 -0700
Kees Cook <kees@kernel.org> wrote:

> On June 3, 2024 4:33:31 PM PDT, Steven Rostedt <rostedt@goodmis.org> wrote:
> >From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> >
> >In order to allow for requesting a memory region that can be used for
> >things like pstore on multiple machines where the memory layout is not the
> >same, add a new option to the kernel command line called "reserve_mem".
> >
> >The format is:  reserve_mem=nn:align:name
> >
> >Where it will find nn amount of memory at the given alignment of align.
> >The name field is to allow another subsystem to retrieve where the memory
> >was found. For example:
> >
> >  reserve_mem=12M:4096:oops ramoops.mem_name=oops  
> 
> How does this interact with KASLR? It has chosen its physical location
> before this parsing happens, so I'd expect this to fail once in a while,
> unless the size/alignment is lucky enough that KASLR never uses that
> portion of the physical memory...
> 

From looking at the KASLR code, it looks to me that it picks from 100
different locations. I could be wrong, but if you have sufficient memory,
I'm thinking that it should not conflict. But if it does, yes, it will fail
to pick the same location.

-- Steve
Steven Rostedt June 4, 2024, 11:08 a.m. UTC | #4
On Tue, 4 Jun 2024 08:03:54 +0200
Ard Biesheuvel <ardb@kernel.org> wrote:

> On Tue, 4 Jun 2024 at 01:35, Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
> >
> > In order to allow for requesting a memory region that can be used for
> > things like pstore on multiple machines where the memory layout is not the
> > same, add a new option to the kernel command line called "reserve_mem".
> >
> > The format is:  reserve_mem=nn:align:name
> >
> > Where it will find nn amount of memory at the given alignment of align.
> > The name field is to allow another subsystem to retrieve where the memory
> > was found. For example:
> >
> >   reserve_mem=12M:4096:oops ramoops.mem_name=oops
> >
> > Where ramoops.mem_name will tell ramoops that memory was reserved for it
> > via the reserve_mem option and it can find it by calling:
> >
> >   if (reserve_mem_find_by_name("oops", &start, &size)) {
> >         // start holds the start address and size holds the size given
> >
> > Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/
> >
> > Suggested-by: Mike Rapoport <rppt@kernel.org>
> > Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>  
> 
> You failed to point out in the commit message that the assumption here
> is that this memory will retain its contents across a soft reboot. Or
> am I misunderstanding this?

Yes that is the intention. I should update the commit message.

> 
> In any case, as I pointed out before, playing these games unilaterally
> from the kernel side, i.e., without any awareness whatsoever from the
> firmware and bootloader (which will not attempt to preserve RAM
> contents), is likely to have a rather disappointing success ratio in
> the general case. I understand this may be different for vertically
> integrated software stacks like ChromeOS so perhaps it should live
> there as a feature.

I have been using this on two different test machines, as well as a
chromebook, and it appears to work on all of them, as well as on VMs. I
plan on adding this to my workstation and server too (they use EFI).

> 
> Then, as Kees points out, there is also the risk that the kernel
> itself may be stepping on this memory before having realized that it
> is reserved. At least ARM and x86 have decompressors with a
> substantial amount of non-trivial placement logic that would need to
> be made aware of this reservation. Note that EFI vs. non-EFI boot also
> makes a difference here.

Agreed. Note, it should definitely state that this is not 100% reliable,
and depending on the setup it may not be reliable at all. Whatever uses it
should add something to confirm that the memory is the same.

If corner cases become an issue, this could be extended to work with them.
We could update KASLR to be aware of this allocation. The documentation
update to kernel-parameters.txt on this usage should definitely stress that
this can be unreliable, and its use should be tested to see if it works. It
should also stress that even if it does work, it may not work every time. The best
usage for this is for statistical debugging. For instance, in our use case,
we have 1000s of crashes that we have no idea why. If this worked only 10%
of the time, the data retrieved from 100 of those crashes would be very
valuable.

-- Steve
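
A minimal sketch of the kind of confirmation Steven describes above: a
consumer could stamp the region with a magic value and checksum when it first
initializes it, and on the next boot trust the contents only if both still
match. This is a hypothetical illustration, not code from the series; the
header layout, the PERSIST_MAGIC value, and the names are invented:

  /*
   * Hypothetical validation sketch -- the header layout, magic value,
   * and names are invented; nothing here is part of the series.
   */
  #include <linux/crc32.h>
  #include <linux/types.h>

  struct persist_hdr {
  	u32	magic;	/* marks a previously initialized region */
  	u32	crc;	/* CRC32 of the payload that follows */
  	u32	len;	/* payload length in bytes */
  };

  #define PERSIST_MAGIC	0x52535256	/* arbitrary "RSRV" tag */

  static bool persist_region_valid(void *va, unsigned long size)
  {
  	struct persist_hdr *hdr = va;

  	if (size < sizeof(*hdr) || hdr->magic != PERSIST_MAGIC)
  		return false;
  	if (hdr->len > size - sizeof(*hdr))
  		return false;

  	/* Contents only match if firmware/bootloader left the RAM untouched */
  	return hdr->crc == crc32(0, hdr + 1, hdr->len);
  }

If the check fails, the consumer should treat the region as empty and
reinitialize the header rather than trusting stale data.
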
Tony Luck June 4, 2024, 4:05 p.m. UTC | #5
> I have been using this on two different test machines, as well as a
> chromebook, and it appears to work on all of them, as well as on VMs. I
> plan on adding this to my workstation and server too (they use EFI).

I think that BIOS on Intel servers with ECC memory will stomp on all
memory (to ensure that ECC bits are all set to good values). There
might be a "fast boot" BIOS option to skip this (but using it leaves you
vulnerable, after a crash caused by an ECC failure, to hitting the same error again).

-Tony
Steven Rostedt June 6, 2024, 2:50 p.m. UTC | #6
On Tue, 4 Jun 2024 16:05:04 +0000
"Luck, Tony" <tony.luck@intel.com> wrote:

> > I have been using this on two different test machines, as well as a
> > chromebook, and it appears to work on all of them, as well as on VMs. I
> > plan on adding this to my workstation and server too (they use EFI).  
> 
> I think that BIOS on Intel servers with ECC memory will stomp on all
> memory (to ensure that ECC bits are all set to good values). There
> might be a "fast boot" BIOS option to skip this (but using it leaves you
> vulnerable, after a crash caused by an ECC failure, to hitting the same error again).
> 

Talking with some people who are interested in this, I was told that
those servers (the ones that take several minutes to boot up) usually
use kexec to reboot, even after a crash (with or without kdump). In
those cases, they said this would likely work for them.

Again, this isn't foolproof nor guaranteed. It's a best-effort
approach that, at least for my use case, works most of the time.

-- Steve
diff mbox series

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9849dfda44d4..b4455cc02f2c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4263,4 +4263,6 @@  static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
 void vma_pgtable_walk_begin(struct vm_area_struct *vma);
 void vma_pgtable_walk_end(struct vm_area_struct *vma);
 
+int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size);
+
 #endif /* _LINUX_MM_H */
diff --git a/mm/memblock.c b/mm/memblock.c
index d09136e040d3..a8bf0ee9e2b4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2244,6 +2244,103 @@  void __init memblock_free_all(void)
 	totalram_pages_add(pages);
 }
 
+/* Keep a table to reserve named memory */
+#define RESERVE_MEM_MAX_ENTRIES		8
+#define RESERVE_MEM_NAME_SIZE		16
+struct reserve_mem_table {
+	char			name[RESERVE_MEM_NAME_SIZE];
+	unsigned long		start;
+	unsigned long		size;
+};
+static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
+static int reserved_mem_count;
+
+/* Add wildcard region with a lookup name */
+static int __init reserved_mem_add(unsigned long start, unsigned long size,
+				   const char *name)
+{
+	struct reserve_mem_table *map;
+
+	if (!name || !name[0] || strlen(name) >= RESERVE_MEM_NAME_SIZE)
+		return -EINVAL;
+
+	if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES)
+		return -1;
+
+	map = &reserved_mem_table[reserved_mem_count++];
+	map->start = start;
+	map->size = size;
+	strscpy(map->name, name);
+	return 0;
+}
+
+/**
+ * reserve_mem_find_by_name - Find reserved memory region with a given name
+ * @name: The name that is attached to a reserved memory region
+ * @start: If found, holds the start address
+ * @size: If found, holds the size of the address.
+ *
+ * Returns: 1 if found or 0 if not found.
+ */
+int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size)
+{
+	struct reserve_mem_table *map;
+	int i;
+
+	for (i = 0; i < reserved_mem_count; i++) {
+		map = &reserved_mem_table[i];
+		if (!map->size)
+			continue;
+		if (strcmp(name, map->name) == 0) {
+			*start = map->start;
+			*size = map->size;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Parse reserve_mem=nn:align:name
+ */
+static int __init reserve_mem(char *p)
+{
+	phys_addr_t start, size, align;
+	char *oldp;
+	int err;
+
+	if (!p)
+		return -EINVAL;
+
+	oldp = p;
+	size = memparse(p, &p);
+	if (p == oldp)
+		return -EINVAL;
+
+	if (*p != ':')
+		return -EINVAL;
+
+	align = memparse(p+1, &p);
+	if (*p != ':')
+		return -EINVAL;
+
+	start = memblock_phys_alloc(size, align);
+	if (!start)
+		return -ENOMEM;
+
+	p++;
+	err = reserved_mem_add(start, size, p);
+	if (err) {
+		memblock_phys_free(start, size);
+		return err;
+	}
+
+	p += strlen(p);
+
+	return *p == '\0' ? 0: -EINVAL;
+}
+__setup("reserve_mem=", reserve_mem);
+
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
 static const char * const flagname[] = {
 	[ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",