Message ID | 20240603233631.452433539@goodmis.org (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | mm/pstore: Reserve named unspecified memory across boots | expand |
On June 3, 2024 4:33:31 PM PDT, Steven Rostedt <rostedt@goodmis.org> wrote: >From: "Steven Rostedt (Google)" <rostedt@goodmis.org> > >In order to allow for requesting a memory region that can be used for >things like pstore on multiple machines where the memory layout is not the >same, add a new option to the kernel command line called "reserve_mem". > >The format is: reserve_mem=nn:align:name > >Where it will find nn amount of memory at the given alignment of align. >The name field is to allow another subsystem to retrieve where the memory >was found. For example: > > reserve_mem=12M:4096:oops ramoops.mem_name=oops How does this interact with KASLR? It has chosen its physical location before this parsing happens, so I'd expect this to fail once in a while, unless the size/alignment is lucky enough that KASLR never uses that portion of the physical memory... -Kees > >Where ramoops.mem_name will tell ramoops that memory was reserved for it >via the reserve_mem option and it can find it by calling: > > if (reserve_mem_find_by_name("oops", &start, &size)) { > // start holds the start address and size holds the size given > >Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/ > >Suggested-by: Mike Rapoport <rppt@kernel.org> >Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> >--- > include/linux/mm.h | 2 + > mm/memblock.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 99 insertions(+) > >diff --git a/include/linux/mm.h b/include/linux/mm.h >index 9849dfda44d4..b4455cc02f2c 100644 >--- a/include/linux/mm.h >+++ b/include/linux/mm.h >@@ -4263,4 +4263,6 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn) > void vma_pgtable_walk_begin(struct vm_area_struct *vma); > void vma_pgtable_walk_end(struct vm_area_struct *vma); > >+int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size); >+ > #endif /* _LINUX_MM_H */ >diff --git a/mm/memblock.c b/mm/memblock.c >index 
d09136e040d3..a8bf0ee9e2b4 100644 >--- a/mm/memblock.c >+++ b/mm/memblock.c >@@ -2244,6 +2244,103 @@ void __init memblock_free_all(void) > totalram_pages_add(pages); > } > >+/* Keep a table to reserve named memory */ >+#define RESERVE_MEM_MAX_ENTRIES 8 >+#define RESERVE_MEM_NAME_SIZE 16 >+struct reserve_mem_table { >+ char name[RESERVE_MEM_NAME_SIZE]; >+ unsigned long start; >+ unsigned long size; >+}; >+static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; >+static int reserved_mem_count; >+ >+/* Add wildcard region with a lookup name */ >+static int __init reserved_mem_add(unsigned long start, unsigned long size, >+ const char *name) >+{ >+ struct reserve_mem_table *map; >+ >+ if (!name || !name[0] || strlen(name) >= RESERVE_MEM_NAME_SIZE) >+ return -EINVAL; >+ >+ if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) >+ return -1; >+ >+ map = &reserved_mem_table[reserved_mem_count++]; >+ map->start = start; >+ map->size = size; >+ strscpy(map->name, name); >+ return 0; >+} >+ >+/** >+ * reserve_mem_find_by_name - Find reserved memory region with a given name >+ * @name: The name that is attached to a reserved memory region >+ * @start: If found, holds the start address >+ * @size: If found, holds the size of the address. >+ * >+ * Returns: 1 if found or 0 if not found. 
>+ */ >+int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size) >+{ >+ struct reserve_mem_table *map; >+ int i; >+ >+ for (i = 0; i < reserved_mem_count; i++) { >+ map = &reserved_mem_table[i]; >+ if (!map->size) >+ continue; >+ if (strcmp(name, map->name) == 0) { >+ *start = map->start; >+ *size = map->size; >+ return 1; >+ } >+ } >+ return 0; >+} >+ >+/* >+ * Parse early_reserve_mem=nn:align:name >+ */ >+static int __init reserve_mem(char *p) >+{ >+ phys_addr_t start, size, align; >+ char *oldp; >+ int err; >+ >+ if (!p) >+ return -EINVAL; >+ >+ oldp = p; >+ size = memparse(p, &p); >+ if (p == oldp) >+ return -EINVAL; >+ >+ if (*p != ':') >+ return -EINVAL; >+ >+ align = memparse(p+1, &p); >+ if (*p != ':') >+ return -EINVAL; >+ >+ start = memblock_phys_alloc(size, align); >+ if (!start) >+ return -ENOMEM; >+ >+ p++; >+ err = reserved_mem_add(start, size, p); >+ if (err) { >+ memblock_phys_free(start, size); >+ return err; >+ } >+ >+ p += strlen(p); >+ >+ return *p == '\0' ? 0: -EINVAL; >+} >+__setup("reserve_mem=", reserve_mem); >+ > #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) > static const char * const flagname[] = { > [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
On Tue, 4 Jun 2024 at 01:35, Steven Rostedt <rostedt@goodmis.org> wrote: > > From: "Steven Rostedt (Google)" <rostedt@goodmis.org> > > In order to allow for requesting a memory region that can be used for > things like pstore on multiple machines where the memory layout is not the > same, add a new option to the kernel command line called "reserve_mem". > > The format is: reserve_mem=nn:align:name > > Where it will find nn amount of memory at the given alignment of align. > The name field is to allow another subsystem to retrieve where the memory > was found. For example: > > reserve_mem=12M:4096:oops ramoops.mem_name=oops > > Where ramoops.mem_name will tell ramoops that memory was reserved for it > via the reserve_mem option and it can find it by calling: > > if (reserve_mem_find_by_name("oops", &start, &size)) { > // start holds the start address and size holds the size given > > Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/ > > Suggested-by: Mike Rapoport <rppt@kernel.org> > Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> You failed to point out in the commit message that the assumption here is that this memory will retain its contents across a soft reboot. Or am I misunderstanding this? In any case, as I pointed out before, playing these games unilaterally from the kernel side, i.e., without any awareness whatsoever from the firmware and bootloader (which will not attempt to preserve RAM contents), is likely to have a rather disappointing success ratio in the general case. I understand this may be different for vertically integrated software stacks like ChromeOS so perhaps it should live there as a feature. Then, as Kees points out, there is also the risk that the kernel itself may be stepping on this memory before having realized that it is reserved. At least ARM and x86 have decompressors with a substantial amount of non-trivial placement logic that would need to be made aware of this reservation. Note that EFI vs. 
non-EFI boot also makes a difference here. > --- > include/linux/mm.h | 2 + > mm/memblock.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 99 insertions(+) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 9849dfda44d4..b4455cc02f2c 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -4263,4 +4263,6 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn) > void vma_pgtable_walk_begin(struct vm_area_struct *vma); > void vma_pgtable_walk_end(struct vm_area_struct *vma); > > +int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size); > + > #endif /* _LINUX_MM_H */ > diff --git a/mm/memblock.c b/mm/memblock.c > index d09136e040d3..a8bf0ee9e2b4 100644 > --- a/mm/memblock.c > +++ b/mm/memblock.c > @@ -2244,6 +2244,103 @@ void __init memblock_free_all(void) > totalram_pages_add(pages); > } > > +/* Keep a table to reserve named memory */ > +#define RESERVE_MEM_MAX_ENTRIES 8 > +#define RESERVE_MEM_NAME_SIZE 16 > +struct reserve_mem_table { > + char name[RESERVE_MEM_NAME_SIZE]; > + unsigned long start; > + unsigned long size; > +}; > +static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; > +static int reserved_mem_count; > + > +/* Add wildcard region with a lookup name */ > +static int __init reserved_mem_add(unsigned long start, unsigned long size, > + const char *name) > +{ > + struct reserve_mem_table *map; > + > + if (!name || !name[0] || strlen(name) >= RESERVE_MEM_NAME_SIZE) > + return -EINVAL; > + > + if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) > + return -1; > + > + map = &reserved_mem_table[reserved_mem_count++]; > + map->start = start; > + map->size = size; > + strscpy(map->name, name); > + return 0; > +} > + > +/** > + * reserve_mem_find_by_name - Find reserved memory region with a given name > + * @name: The name that is attached to a reserved memory region > + * @start: If found, holds the start address > + * @size: If found, holds the 
size of the address. > + * > + * Returns: 1 if found or 0 if not found. > + */ > +int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size) > +{ > + struct reserve_mem_table *map; > + int i; > + > + for (i = 0; i < reserved_mem_count; i++) { > + map = &reserved_mem_table[i]; > + if (!map->size) > + continue; > + if (strcmp(name, map->name) == 0) { > + *start = map->start; > + *size = map->size; > + return 1; > + } > + } > + return 0; > +} > + > +/* > + * Parse early_reserve_mem=nn:align:name > + */ > +static int __init reserve_mem(char *p) > +{ > + phys_addr_t start, size, align; > + char *oldp; > + int err; > + > + if (!p) > + return -EINVAL; > + > + oldp = p; > + size = memparse(p, &p); > + if (p == oldp) > + return -EINVAL; > + > + if (*p != ':') > + return -EINVAL; > + > + align = memparse(p+1, &p); > + if (*p != ':') > + return -EINVAL; > + > + start = memblock_phys_alloc(size, align); > + if (!start) > + return -ENOMEM; > + > + p++; > + err = reserved_mem_add(start, size, p); > + if (err) { > + memblock_phys_free(start, size); > + return err; > + } > + > + p += strlen(p); > + > + return *p == '\0' ? 0: -EINVAL; > +} > +__setup("reserve_mem=", reserve_mem); > + > #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) > static const char * const flagname[] = { > [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG", > -- > 2.43.0 > > >
On Mon, 03 Jun 2024 22:52:37 -0700 Kees Cook <kees@kernel.org> wrote: > On June 3, 2024 4:33:31 PM PDT, Steven Rostedt <rostedt@goodmis.org> wrote: > >From: "Steven Rostedt (Google)" <rostedt@goodmis.org> > > > >In order to allow for requesting a memory region that can be used for > >things like pstore on multiple machines where the memory layout is not the > >same, add a new option to the kernel command line called "reserve_mem". > > > >The format is: reserve_mem=nn:align:name > > > >Where it will find nn amount of memory at the given alignment of align. > >The name field is to allow another subsystem to retrieve where the memory > >was found. For example: > > > > reserve_mem=12M:4096:oops ramoops.mem_name=oops > > How does this interact with KASLR? It has chosen its physical location > before this parsing happens, so I'd expect this to fail once in a while, > unless the size/alignment is lucky enough that KASLR never uses that > portion of the physical memory... > From looking at the KASLR code, it looks to me that it picks from 100 different locations. I could be wrong, but if you have sufficient memory, I'm thinking that it should not conflict. But if it does, yes, it will fail to pick the same location. -- Steve
On Tue, 4 Jun 2024 08:03:54 +0200 Ard Biesheuvel <ardb@kernel.org> wrote: > On Tue, 4 Jun 2024 at 01:35, Steven Rostedt <rostedt@goodmis.org> wrote: > > > > From: "Steven Rostedt (Google)" <rostedt@goodmis.org> > > > > In order to allow for requesting a memory region that can be used for > > things like pstore on multiple machines where the memory layout is not the > > same, add a new option to the kernel command line called "reserve_mem". > > > > The format is: reserve_mem=nn:align:name > > > > Where it will find nn amount of memory at the given alignment of align. > > The name field is to allow another subsystem to retrieve where the memory > > was found. For example: > > > > reserve_mem=12M:4096:oops ramoops.mem_name=oops > > > > Where ramoops.mem_name will tell ramoops that memory was reserved for it > > via the reserve_mem option and it can find it by calling: > > > > if (reserve_mem_find_by_name("oops", &start, &size)) { > > // start holds the start address and size holds the size given > > > > Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/ > > > > Suggested-by: Mike Rapoport <rppt@kernel.org> > > Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org> > > You failed to point out in the commit message that the assumption here > is that this memory will retain its contents across a soft reboot. Or > am I misunderstanding this? Yes that is the intention. I should update the commit message. > > In any case, as I pointed out before, playing these games unilaterally > from the kernel side, i.e., without any awareness whatsoever from the > firmware and bootloader (which will not attempt to preserve RAM > contents), is likely to have a rather disappointing success ratio in > the general case. I understand this may be different for vertically > integrated software stacks like ChromeOS so perhaps it should live > there as a feature. 
I have been using this on two different test machines, as well as a chromebook, and it appears to work on all of them. As well as for VMs. I plan on adding this to my workstation and server too (they use EFI). > > Then, as Kees points out, there is also the risk that the kernel > itself may be stepping on this memory before having realized that it > is reserved. At least ARM and x86 have decompressors with a > substantial amount of non-trivial placement logic that would need to > be made aware of this reservation. Note that EFI vs. non-EFI boot also > makes a difference here. Agreed. Note, it should definitely state that this is not 100% reliable, and depending on the setup it may not be reliable at all. Whatever uses it should add something to confirm that the memory is the same. If corner cases become an issue, this could be extended to work with them. We could update KASLR to be aware of this allocation. The documentation update to kernel-parameters.txt on this usage should definitely stress that this can be unreliable, and use should be tested to see if it works. And also stress that if it does work, it may not work all the time. The best usage for this is for statistical debugging. For instance, in our use case, we have 1000s of crashes whose cause we have no idea about. If this worked only 10% of the time, the data retrieved from 100 of those crashes would be very valuable. -- Steve
> I have been using this on two different test machines, as well as a > chromebook, and it appears to work on all of them. As well as for VMs. I > plan on adding this to my workstation and server too (they use EFI). I think that BIOS on Intel servers with ECC memory will stomp on all memory (to ensure that ECC bits are all set to good values). There might be a "fast boot" BIOS option to skip this (but using it leaves you vulnerable, after a crash due to an ECC failure, to hitting the same error again). -Tony
On Tue, 4 Jun 2024 16:05:04 +0000 "Luck, Tony" <tony.luck@intel.com> wrote: > > I have been using this on two different test machines, as well as a > > chromebook, and it appears to work on all of them. As well as for VMs. I > > plan on adding this to my workstation and server too (they use EFI). > > I think that BIOS on Intel servers with ECC memory will stomp on all > memory (to ensure that ECC bits are all set to good values). There > might be a "fast boot" BIOS option to skip this (but using it leaves you > vulnerable, after a crash due to an ECC failure, to hitting the same error again). > Talking with some people that are interested in this, they told me that those servers (the ones that take several minutes to boot up) usually use kexec to reboot. Even after a crash (with or without kdump). In those cases, they said this would likely work for them. Again, this isn't foolproof nor guaranteed. It's a best effort approach that, at least for my use case, works most of the time. -- Steve
diff --git a/include/linux/mm.h b/include/linux/mm.h index 9849dfda44d4..b4455cc02f2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4263,4 +4263,6 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn) void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); +int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size); + #endif /* _LINUX_MM_H */ diff --git a/mm/memblock.c b/mm/memblock.c index d09136e040d3..a8bf0ee9e2b4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2244,6 +2244,103 @@ void __init memblock_free_all(void) totalram_pages_add(pages); } +/* Keep a table to reserve named memory */ +#define RESERVE_MEM_MAX_ENTRIES 8 +#define RESERVE_MEM_NAME_SIZE 16 +struct reserve_mem_table { + char name[RESERVE_MEM_NAME_SIZE]; + unsigned long start; + unsigned long size; +}; +static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; +static int reserved_mem_count; + +/* Add wildcard region with a lookup name */ +static int __init reserved_mem_add(unsigned long start, unsigned long size, + const char *name) +{ + struct reserve_mem_table *map; + + if (!name || !name[0] || strlen(name) >= RESERVE_MEM_NAME_SIZE) + return -EINVAL; + + if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) + return -1; + + map = &reserved_mem_table[reserved_mem_count++]; + map->start = start; + map->size = size; + strscpy(map->name, name); + return 0; +} + +/** + * reserve_mem_find_by_name - Find reserved memory region with a given name + * @name: The name that is attached to a reserved memory region + * @start: If found, holds the start address + * @size: If found, holds the size of the address. + * + * Returns: 1 if found or 0 if not found. 
+ */ +int reserve_mem_find_by_name(const char *name, unsigned long *start, unsigned long *size) +{ + struct reserve_mem_table *map; + int i; + + for (i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + if (strcmp(name, map->name) == 0) { + *start = map->start; + *size = map->size; + return 1; + } + } + return 0; +} + +/* + * Parse early_reserve_mem=nn:align:name + */ +static int __init reserve_mem(char *p) +{ + phys_addr_t start, size, align; + char *oldp; + int err; + + if (!p) + return -EINVAL; + + oldp = p; + size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + if (*p != ':') + return -EINVAL; + + align = memparse(p+1, &p); + if (*p != ':') + return -EINVAL; + + start = memblock_phys_alloc(size, align); + if (!start) + return -ENOMEM; + + p++; + err = reserved_mem_add(start, size, p); + if (err) { + memblock_phys_free(start, size); + return err; + } + + p += strlen(p); + + return *p == '\0' ? 0: -EINVAL; +} +__setup("reserve_mem=", reserve_mem); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static const char * const flagname[] = { [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",