
[v7,3/6] accel/kvm: Report the loss of a large memory page

Message ID 20250201095726.3768796-4-william.roche@oracle.com (mailing list archive)
State New, archived
Series Poisoned memory recovery on reboot

Commit Message

William Roche Feb. 1, 2025, 9:57 a.m. UTC
From: William Roche <william.roche@oracle.com>

In case of a large page impacted by a memory error, provide
information about the impacted large page before the memory
error injection message.

This message also appears on RAS-enabled ARM platforms, with the
introduction of an error injection message similar to the x86 one.

When a large page is impacted, we now report:
Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>

The +<fd_offset> information is only provided with a file backend.

Signed-off-by: William Roche <william.roche@oracle.com>
---
 accel/kvm/kvm-all.c       | 18 ++++++++++++++++++
 include/exec/cpu-common.h | 10 ++++++++++
 system/physmem.c          | 22 ++++++++++++++++++++++
 target/arm/kvm.c          |  3 +++
 4 files changed, 53 insertions(+)

Comments

Peter Xu Feb. 4, 2025, 5:01 p.m. UTC | #1
On Sat, Feb 01, 2025 at 09:57:23AM +0000, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> In case of a large page impacted by a memory error, provide
> information about the impacted large page before the memory
> error injection message.
> 
> This message also appears on RAS-enabled ARM platforms, with the
> introduction of an error injection message similar to the x86 one.
> 
> When a large page is impacted, we now report:
> Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
> 
> The +<fd_offset> information is only provided with a file backend.
> 
> Signed-off-by: William Roche <william.roche@oracle.com>

This is still a pretty kvm / arch relevant patch that needs some reviews.

I wonder whether we really need this - we could fetch the ramblock mapping
(e.g. hwaddr -> HVA) via HMP "info ramblock", and dmesg also shows the
process ID + VA.  IIUC we already have all the info below as long as we do
some math based on the above.  Would that work too?
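
For what it's worth, a minimal sketch of the math I mean (not part of the
patch; the block name, base HVA and poisoned address are made-up values
standing in for what "info ramblock" and the host dmesg would report):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* From HMP "info ramblock": block name, base HVA, backend page size. */
    const char *idstr = "/objects/mem";          /* hypothetical name */
    uint64_t block_hva = 0x7f0000000000ULL;      /* hypothetical base HVA */
    uint64_t block_page_size = 2 * 1024 * 1024;  /* 2M hugepage backend */

    /* From the host dmesg hwpoison report for this QEMU process. */
    uint64_t poisoned_hva = 0x7f00004a1000ULL;   /* hypothetical poisoned HVA */

    /* Offset of the impacted large page within the ramblock. */
    uint64_t offset = (poisoned_hva - block_hva) & ~(block_page_size - 1);

    printf("Memory Error on large page from %s:%" PRIx64 " +%" PRIx64 "\n",
           idstr, offset, block_page_size);
    return 0;
}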

> ---
>  accel/kvm/kvm-all.c       | 18 ++++++++++++++++++
>  include/exec/cpu-common.h | 10 ++++++++++
>  system/physmem.c          | 22 ++++++++++++++++++++++
>  target/arm/kvm.c          |  3 +++
>  4 files changed, 53 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f89568bfa3..9a0d970ce1 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1296,6 +1296,24 @@ static void kvm_unpoison_all(void *param)
>  void kvm_hwpoison_page_add(ram_addr_t ram_addr)
>  {
>      HWPoisonPage *page;
> +    struct RAMBlockInfo rb_info;
> +
> +    if (qemu_ram_block_info_from_addr(ram_addr, &rb_info)) {
> +        size_t ps = rb_info.page_size;
> +
> +        if (ps > TARGET_PAGE_SIZE) {
> +            uint64_t offset = QEMU_ALIGN_DOWN(ram_addr - rb_info.offset, ps);
> +
> +            if (rb_info.fd >= 0) {
> +                error_report("Memory Error on large page from %s:%" PRIx64
> +                             "+%" PRIx64 " +%zx", rb_info.idstr, offset,
> +                             rb_info.fd_offset, ps);
> +            } else {
> +                error_report("Memory Error on large page from %s:%" PRIx64
> +                            " +%zx", rb_info.idstr, offset, ps);
> +            }
> +        }
> +    }
>  
>      QLIST_FOREACH(page, &hwpoison_page_list, list) {
>          if (page->ram_addr == ram_addr) {
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index 3771b2130c..190bd4f34a 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -110,6 +110,16 @@ int qemu_ram_get_fd(RAMBlock *rb);
>  size_t qemu_ram_pagesize(RAMBlock *block);
>  size_t qemu_ram_pagesize_largest(void);
>  
> +struct RAMBlockInfo {
> +    char idstr[256];
> +    ram_addr_t offset;
> +    int fd;
> +    uint64_t fd_offset;
> +    size_t page_size;
> +};
> +bool qemu_ram_block_info_from_addr(ram_addr_t ram_addr,
> +                                   struct RAMBlockInfo *block);
> +
>  /**
>   * cpu_address_space_init:
>   * @cpu: CPU to add this address space to
> diff --git a/system/physmem.c b/system/physmem.c
> index e8ff930bc9..686f569270 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -1678,6 +1678,28 @@ size_t qemu_ram_pagesize_largest(void)
>      return largest;
>  }
>  
> +/* Copy RAMBlock information associated to the given ram_addr location */
> +bool qemu_ram_block_info_from_addr(ram_addr_t ram_addr,
> +                                   struct RAMBlockInfo *b_info)
> +{
> +    RAMBlock *rb;
> +
> +    assert(b_info);
> +
> +    RCU_READ_LOCK_GUARD();
> +    rb =  qemu_get_ram_block(ram_addr);
> +    if (!rb) {
> +        return false;
> +    }
> +
> +    pstrcat(b_info->idstr, sizeof(b_info->idstr), rb->idstr);
> +    b_info->offset = rb->offset;
> +    b_info->fd = rb->fd;
> +    b_info->fd_offset = rb->fd_offset;
> +    b_info->page_size = rb->page_size;
> +    return true;
> +}
> +
>  static int memory_try_enable_merging(void *addr, size_t len)
>  {
>      if (!machine_mem_merge(current_machine)) {
> diff --git a/target/arm/kvm.c b/target/arm/kvm.c
> index da30bdbb23..d9dedc6d74 100644
> --- a/target/arm/kvm.c
> +++ b/target/arm/kvm.c
> @@ -2389,6 +2389,9 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>                  kvm_cpu_synchronize_state(c);
>                  if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
>                      kvm_inject_arm_sea(c);
> +                    error_report("Guest Memory Error at QEMU addr %p and "
> +                        "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
> +                        addr, paddr, "BUS_MCEERR_AR");
>                  } else {
>                      error_report("failed to record the error");
>                      abort();
> -- 
> 2.43.5
>
William Roche Feb. 5, 2025, 4:27 p.m. UTC | #2
On 2/4/25 18:01, Peter Xu wrote:
> On Sat, Feb 01, 2025 at 09:57:23AM +0000, William Roche wrote:
>> From: William Roche <william.roche@oracle.com>
>>
>> In case of a large page impacted by a memory error, provide
>> information about the impacted large page before the memory
>> error injection message.
>>
>> This message also appears on RAS-enabled ARM platforms, with the
>> introduction of an error injection message similar to the x86 one.
>>
>> When a large page is impacted, we now report:
>> Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
>>
>> The +<fd_offset> information is only provided with a file backend.
>>
>> Signed-off-by: William Roche <william.roche@oracle.com>
> 
> This is still a pretty kvm / arch relevant patch that needs some reviews.
> 
> I wonder whether we really need this - we could fetch the ramblock mapping
> (e.g. hwaddr -> HVA) via HMP "info ramblock", and dmesg also shows the
> process ID + VA.  IIUC we already have all the info below as long as we do
> some math based on the above.  Would that work too?

The HMP command "info ramblock" is implemented with the 
ram_block_format() function which returns a message buffer built with a 
string for each ramblock (protected by the RCU_READ_LOCK_GUARD). Our new 
function copies a struct with the necessary information.

Relying on the buffer format to retrieve the information doesn't seem
reasonable, and more importantly, this buffer doesn't provide all the 
needed data, like fd and fd_offset.

I would say that ram_block_format() and qemu_ram_block_info_from_addr() 
serve 2 different goals.

(a reimplementation of ram_block_format() with an adapted version of 
qemu_ram_block_info_from_addr() taking the extra information needed 
could be doable for example, but may not be worth doing for now)
Peter Xu Feb. 5, 2025, 5:07 p.m. UTC | #3
On Wed, Feb 05, 2025 at 05:27:13PM +0100, William Roche wrote:
> On 2/4/25 18:01, Peter Xu wrote:
> > On Sat, Feb 01, 2025 at 09:57:23AM +0000, William Roche wrote:
> > > From: William Roche <william.roche@oracle.com>
> > > 
> > > In case of a large page impacted by a memory error, provide
> > > information about the impacted large page before the memory
> > > error injection message.
> > > 
> > > This message also appears on RAS-enabled ARM platforms, with the
> > > introduction of an error injection message similar to the x86 one.
> > > 
> > > When a large page is impacted, we now report:
> > > Memory Error on large page from <backend>:<address>+<fd_offset> +<page_size>
> > > 
> > > The +<fd_offset> information is only provided with a file backend.
> > > 
> > > Signed-off-by: William Roche <william.roche@oracle.com>
> > 
> > This is still a pretty kvm / arch relevant patch that needs some reviews.
> > 
> > I wonder whether we really need this - we could fetch the ramblock mapping
> > (e.g. hwaddr -> HVA) via HMP "info ramblock", and dmesg also shows the
> > process ID + VA.  IIUC we already have all the info below as long as we do
> > some math based on the above.  Would that work too?
> 
> The HMP command "info ramblock" is implemented with the ram_block_format()
> function which returns a message buffer built with a string for each
> ramblock (protected by the RCU_READ_LOCK_GUARD). Our new function copies a
> struct with the necessary information.
> 
> Relying on the buffer format to retrieve the information doesn't seem
> reasonable, and more importantly, this buffer doesn't provide all the needed
> data, like fd and fd_offset.
> 
> I would say that ram_block_format() and qemu_ram_block_info_from_addr()
> serve 2 different goals.
> 
> (a reimplementation of ram_block_format() with an adapted version of
> qemu_ram_block_info_from_addr() taking the extra information needed could be
> doable for example, but may not be worth doing for now)

IIUC admin should be aware of fd_offset because the admin should be fully
aware of the start offset of FDs to specify in qemu cmdlines, or in
Libvirt. But yes, we can always add fd_offset into ram_block_format() if
it's helpful.

Besides, the existing issues on this patch:

  - From outcome of this patch, it introduces one ramblock API (which is ok
    to me, so far), to do some error_report()s.  It looks pretty much for
    debugging rather than something serious (e.g. report via QMP queries,
    QMP events etc.).  From debug POV, I still don't see why this is
    needed.. per discussed above.

  - From merge POV, this patch isn't a pure memory change, so I'll need to
    get ack from other maintainers, at least that should be how it works..

I feel like when hwpoison becomes a serious topic, we need some more
serious reporting facility than error reports.  So that we could have this
as separate topic to be revisited.  It might speed up your prior patches
from not being blocked on this.

Thanks,
William Roche Feb. 7, 2025, 6:02 p.m. UTC | #4
On 2/5/25 18:07, Peter Xu wrote:
> On Wed, Feb 05, 2025 at 05:27:13PM +0100, William Roche wrote:
>> [...]
>> The HMP command "info ramblock" is implemented with the ram_block_format()
>> function which returns a message buffer built with a string for each
>> ramblock (protected by the RCU_READ_LOCK_GUARD). Our new function copies a
>> struct with the necessary information.
>>
>> Relying on the buffer format to retrieve the information doesn't seem
>> reasonable, and more importantly, this buffer doesn't provide all the needed
>> data, like fd and fd_offset.
>>
>> I would say that ram_block_format() and qemu_ram_block_info_from_addr()
>> serve 2 different goals.
>>
>> (a reimplementation of ram_block_format() with an adapted version of
>> qemu_ram_block_info_from_addr() taking the extra information needed could be
>> doable for example, but may not be worth doing for now)
> 
> IIUC admin should be aware of fd_offset because the admin should be fully
> aware of the start offset of FDs to specify in qemu cmdlines, or in
> Libvirt. But yes, we can always add fd_offset into ram_block_format() if
> it's helpful.
> 
> Besides, the existing issues on this patch:
> 
>    - From outcome of this patch, it introduces one ramblock API (which is ok
>      to me, so far), to do some error_report()s.  It looks pretty much for
>      debugging rather than something serious (e.g. report via QMP queries,
>      QMP events etc.).  From debug POV, I still don't see why this is
>      needed.. per discussed above.

The reason why I want to inform the user of a large memory failure more 
specifically than of a standard-sized page loss is the significant 
behavior difference: our current implementation can transparently handle 
many situations without necessarily leading the VM to a crash. But when 
it comes to large pages, there is no mechanism to inform the VM of a 
large memory loss; this situation usually leads the VM to crash, and can 
also generate some weird situations like qemu itself crashing or a loop 
of errors, for example.

So having a message informing of such a memory loss can help to 
understand a more radical VM or qemu behavior -- it increases the 
diagnosability of our code.

To verify that a SIGBUS appeared because of a large page loss, we 
currently need to verify the targeted memory block backend page_size.
We should usually get this information from the SIGBUS siginfo data 
(with a si_addr_lsb field giving an indication of the page size) but a 
KVM weakness with a hardcoded si_addr_lsb=PAGE_SHIFT value in the SIGBUS 
siginfo returned from the kernel prevents that: See 
kvm_send_hwpoison_signal() function.
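
To illustrate (just a sketch, not part of the patch): if the kernel filled 
si_addr_lsb correctly, a SIGBUS handler could derive the page size directly 
from the siginfo, along these lines. With the hardcoded PAGE_SHIFT, the 
computed size below always comes out as the base page size, even for a 
lost hugepage:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
    (void)sig;
    (void)ctx;

    if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO) {
        /* si_addr_lsb is the least significant bit of the faulting address,
         * i.e. log2 of the poisoned page size reported by the kernel. */
        size_t page_size = (size_t)1 << si->si_addr_lsb;

        /* fprintf() is not async-signal-safe; good enough for a sketch. */
        fprintf(stderr, "hwpoison at %p, reported page size 0x%zx\n",
                si->si_addr, page_size);
    }
}

void install_sigbus_handler(void)
{
    struct sigaction sa = { 0 };

    sa.sa_sigaction = sigbus_handler;
    sa.sa_flags = SA_SIGINFO;
    sigemptyset(&sa.sa_mask);
    sigaction(SIGBUS, &sa, NULL);
}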

So I first wrote a small API addition called 
qemu_ram_pagesize_from_addr() to retrieve only this page_size value from 
the impacted address; and later on, this function turned into the richer 
qemu_ram_block_info_from_addr() function to have the generated messages 
match the existing memory messages as rightly requested by David.

So the main reason is a KVM "weakness" with kvm_send_hwpoison_signal(), 
and the second reason is to have richer error messages.



>    - From merge POV, this patch isn't a pure memory change, so I'll need to
>      get ack from other maintainers, at least that should be how it works..

I agree :)

> 
> I feel like when hwpoison becomes a serious topic, we need some more
> serious reporting facility than error reports.  So that we could have this
> as separate topic to be revisited.  It might speed up your prior patches
> from not being blocked on this.

I explained why I think that error messages are important, but I don't 
want to get blocked on fixing the hugepage memory recovery because of that.

If you think that not displaying a specific message for large page loss 
can help to get the recovery fixed, then I can change my proposal to do so.

Early next week, I'll send a simplified version of my first 3 patches 
without these specific messages and without the preallocation handling in 
all remap cases, so you can evaluate this possibility.

Thanks again for your feedback,
William.
Peter Xu Feb. 10, 2025, 4:48 p.m. UTC | #5
On Fri, Feb 07, 2025 at 07:02:22PM +0100, William Roche wrote:
> On 2/5/25 18:07, Peter Xu wrote:
> > On Wed, Feb 05, 2025 at 05:27:13PM +0100, William Roche wrote:
> > > [...]
> > > The HMP command "info ramblock" is implemented with the ram_block_format()
> > > function which returns a message buffer built with a string for each
> > > ramblock (protected by the RCU_READ_LOCK_GUARD). Our new function copies a
> > > struct with the necessary information.
> > > 
> > > Relying on the buffer format to retrieve the information doesn't seem
> > > reasonable, and more importantly, this buffer doesn't provide all the needed
> > > data, like fd and fd_offset.
> > > 
> > > I would say that ram_block_format() and qemu_ram_block_info_from_addr()
> > > serve 2 different goals.
> > > 
> > > (a reimplementation of ram_block_format() with an adapted version of
> > > qemu_ram_block_info_from_addr() taking the extra information needed could be
> > > doable for example, but may not be worth doing for now)
> > 
> > IIUC admin should be aware of fd_offset because the admin should be fully
> > aware of the start offset of FDs to specify in qemu cmdlines, or in
> > Libvirt. But yes, we can always add fd_offset into ram_block_format() if
> > it's helpful.
> > 
> > Besides, the existing issues on this patch:
> > 
> >    - From outcome of this patch, it introduces one ramblock API (which is ok
> >      to me, so far), to do some error_report()s.  It looks pretty much for
> >      debugging rather than something serious (e.g. report via QMP queries,
> >      QMP events etc.).  From debug POV, I still don't see why this is
> >      needed.. per discussed above.
> 
> The reason why I want to inform the user of a large memory failure more
> specifically than of a standard-sized page loss is the significant behavior
> difference: our current implementation can transparently handle many
> situations without necessarily leading the VM to a crash. But when it comes
> to large pages, there is no mechanism to inform the VM of a large memory
> loss; this situation usually leads the VM to crash, and can also generate
> some weird situations like qemu itself crashing or a loop of errors, for
> example.
> 
> So having a message informing of such a memory loss can help to understand a
> more radical VM or qemu behavior -- it increases the diagnosability of our
> code.
> 
> To verify that a SIGBUS appeared because of a large page loss, we currently
> need to verify the targeted memory block backend page_size.
> We should usually get this information from the SIGBUS siginfo data (with a
> si_addr_lsb field giving an indication of the page size) but a KVM weakness
> with a hardcoded si_addr_lsb=PAGE_SHIFT value in the SIGBUS siginfo returned
> from the kernel prevents that: See kvm_send_hwpoison_signal() function.
> 
> So I first wrote a small API addition called qemu_ram_pagesize_from_addr()
> to retrieve only this page_size value from the impacted address; and later
> on, this function turned into the richer qemu_ram_block_info_from_addr()
> function to have the generated messages match the existing memory messages
> as rightly requested by David.
> 
> So the main reason is a KVM "weakness" with kvm_send_hwpoison_signal(), and
> the second reason is to have richer error messages.

This seems true, and I also remember something when I looked at this
previously but maybe nobody tried to fix it.  ARM seems to be correct on
that field, otoh.

Is it possible we fix KVM on x86?

kvm_handle_error_pfn() has the fault context, so IIUC it should be able to
figure that out too like what ARM does (with get_vma_page_shift()).
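
Just to illustrate the direction (a rough, untested sketch, assuming the x86
helper keeps its current slot/gfn shape -- not a reviewed kernel patch): the
lsb could be derived from the backing VMA instead of being hardcoded to
PAGE_SHIFT, similar to what arm64 computes via get_vma_page_shift():

static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
{
        unsigned long hva = gfn_to_hva_memslot(slot, gfn);
        struct vm_area_struct *vma;
        short lsb = PAGE_SHIFT;

        /* Derive the real backing page size (e.g. 2M/1G hugetlb) from the VMA. */
        mmap_read_lock(current->mm);
        vma = vma_lookup(current->mm, hva);
        if (vma)
                lsb = ilog2(vma_kernel_pagesize(vma));
        mmap_read_unlock(current->mm);

        send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, lsb, current);
}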

> 
> 
> 
> >    - From merge POV, this patch isn't a pure memory change, so I'll need to
> >      get ack from other maintainers, at least that should be how it works..
> 
> I agree :)
> 
> > 
> > I feel like when hwpoison becomes a serious topic, we need some more
> > serious reporting facility than error reports.  So that we could have this
> > as separate topic to be revisited.  It might speed up your prior patches
> > from not being blocked on this.
> 
> I explained why I think that error messages are important, but I don't want
> to get blocked on fixing the hugepage memory recovery because of that.

What is the major benefit of reporting in QEMU's stderr in this case?

For example, how should we consume the error reports that this patch
introduces?  Is it still for debugging purpose?

I agree it's always better to dump something in QEMU when such happened,
but IIUC what I mentioned above (by monitoring QEMU ramblock setups, and
monitor host dmesg on any vaddr reported hwpoison) should also allow anyone
to deduce the page size of affected vaddr, especially if it's for debugging
purpose.  However I could possibly have missed the goal here..

> 
> If you think that not displaying a specific message for large page loss can
> help to get the recovery fixed, then I can change my proposal to do so.
> 
> Early next week, I'll send a simplified version of my first 3 patches
> without these specific messages and without the preallocation handling in all
> remap cases, so you can evaluate this possibility.

Yes IMHO it'll always be helpful to separate it if possible.

Thanks,
William Roche Feb. 11, 2025, 9:22 p.m. UTC | #6
On 2/10/25 17:48, Peter Xu wrote:
> On Fri, Feb 07, 2025 at 07:02:22PM +0100, William Roche wrote:
>> [...]
>> So the main reason is a KVM "weakness" with kvm_send_hwpoison_signal(), and
>> the second reason is to have richer error messages.
> 
> This seems true, and I also remember something when I looked at this
> previously but maybe nobody tried to fix it.  ARM seems to be correct on
> that field, otoh.
> 
> Is it possible we fix KVM on x86?

Yes, very probably, and it would be a kernel fix.
This kernel modification would first need to be running on the 
hypervisor, so that new code in qemu could use the SIGBUS siginfo 
information to identify the size of the impacted page (instead of using 
an internal addition to the kvm API).
But this mechanism could help to generate a large-page-specific memory 
error message on receiving a SIGBUS.


>>>
>>> I feel like when hwpoison becomes a serious topic, we need some more
>>> serious reporting facility than error reports.  So that we could have this
>>> as separate topic to be revisited.  It might speed up your prior patches
>>> from not being blocked on this.
>>
>> I explained why I think that error messages are important, but I don't want
>> to get blocked on fixing the hugepage memory recovery because of that.
> 
> What is the major benefit of reporting in QEMU's stderr in this case?

Such messages can be collected into a VM-specific log file, as any other 
error_report() message, like the existing x86 error injection messages 
reported by Qemu.
These messages should help the administrator to better understand the 
behavior of the VM.


> For example, how should we consume the error reports that this patch
> introduces?  Is it still for debugging purpose?

It's not only debugging, but it's a trace of a significant event that can 
have major consequences on the VM.

> 
> I agree it's always better to dump something in QEMU when such happened,
> but IIUC what I mentioned above (by monitoring QEMU ramblock setups, and
> monitor host dmesg on any vaddr reported hwpoison) should also allow anyone
> to deduce the page size of affected vaddr, especially if it's for debugging
> purpose.  However I could possibly have missed the goal here..

You're right that knowing the address, the administrator can deduce what 
memory area was impacted and the associated page size. But the goal of 
these large page specific messages was to give details on the event type 
and immediately qualify the consequences.
Using large pages can also have drawbacks, and a large page specific 
message on a memory error makes that more obvious! It is not only a debug msg, 
but an indication that the VM lost an unusually large amount of its memory.

>>
>> If you think that not displaying a specific message for large page loss can
>> help to get the recovery fixed, than I can change my proposal to do so.
>>
>> Early next week, I'll send a simplified version of my first 3 patches
>> without this specific messages and without the preallocation handling in all
>> remap cases, so you can evaluate this possibility.
> 
> Yes IMHO it'll always be helpful to separate it if possible.

I'm now sending a v8 version, without the specific messages and the 
remap notification. It should fix the main recovery bug we currently 
have. More messages and a notification dealing with pre-allocation can 
be added in a second step.

Please let me know if this v8 version can be integrated without the 
prealloc and specific messages?

Thanks,
William.
Peter Xu Feb. 11, 2025, 9:45 p.m. UTC | #7
On Tue, Feb 11, 2025 at 10:22:38PM +0100, William Roche wrote:
> On 2/10/25 17:48, Peter Xu wrote:
> > On Fri, Feb 07, 2025 at 07:02:22PM +0100, William Roche wrote:
> > > [...]
> > > So the main reason is a KVM "weakness" with kvm_send_hwpoison_signal(), and
> > > the second reason is to have richer error messages.
> > 
> > This seems true, and I also remember something when I looked at this
> > previously but maybe nobody tried to fix it.  ARM seems to be correct on
> > that field, otoh.
> > 
> > Is it possible we fix KVM on x86?
> 
> Yes, very probably, and it would be a kernel fix.
> This kernel modification would first need to be running on the hypervisor,
> so that new code in qemu could use the SIGBUS siginfo information to
> identify the size of the impacted page (instead of using an internal
> addition to the kvm API).
> But this mechanism could help to generate a large-page-specific memory
> error message on receiving a SIGBUS.

Yes, QEMU should probably be able to work on both old and new kernels,
even if this gets fixed.
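
For instance something along these lines (illustrative only, the helper is
made up and not an existing QEMU API): trust si_addr_lsb when a fixed kernel
reports more than the base page size, and keep the RAMBlock page_size lookup
as the fallback on older kernels:

#include <stddef.h>

size_t hwpoison_page_size(short si_addr_lsb, size_t ramblock_page_size,
                          size_t base_page_size)
{
    size_t siginfo_size = (size_t)1 << si_addr_lsb;

    if (siginfo_size > base_page_size) {
        /* Fixed kernel: siginfo already carries the real (huge) page size. */
        return siginfo_size;
    }
    /* Old kernel: si_addr_lsb is hardcoded to PAGE_SHIFT, use the backend. */
    return ramblock_page_size;
}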

> 
> 
> > > > 
> > > > I feel like when hwpoison becomes a serious topic, we need some more
> > > > serious reporting facility than error reports.  So that we could have this
> > > > as separate topic to be revisited.  It might speed up your prior patches
> > > > from not being blocked on this.
> > > 
> > > I explained why I think that error messages are important, but I don't want
> > > to get blocked on fixing the hugepage memory recovery because of that.
> > 
> > What is the major benefit of reporting in QEMU's stderr in this case?
> 
> Such messages can be collected into a VM-specific log file, as any other
> error_report() message, like the existing x86 error injection messages
> reported by Qemu.
> These messages should help the administrator to better understand the
> behavior of the VM.

I'll still put "better understand the behavior of the VM" into the
debugging category. :)

But I agree such can be important information.  That's also why I was
curious whether it should be something like a QMP event instead.  That's a
much more formal way of sending important messages.

> 
> 
> > For example, how should we consume the error reports that this patch
> > introduces?  Is it still for debugging purpose?
> 
> It's not only debugging, but it's a trace of a significant event that can
> have major consequences on the VM.
> 
> > 
> > I agree it's always better to dump something in QEMU when such happened,
> > but IIUC what I mentioned above (by monitoring QEMU ramblock setups, and
> > monitor host dmesg on any vaddr reported hwpoison) should also allow anyone
> > to deduce the page size of affected vaddr, especially if it's for debugging
> > purpose.  However I could possibly have missed the goal here..
> 
> You're right that knowing the address, the administrator can deduce what
> memory area was impacted and the associated page size. But the goal of these
> large page specific messages was to give details on the event type and
> immediately qualify the consequences.
> Using large pages can also have drawbacks, and a large page specific message
> on a memory error makes that more obvious! It is not only a debug msg, but an
> indication that the VM lost an unusually large amount of its memory.
> 
> > > 
> > > If you think that not displaying a specific message for large page loss can
> > > help to get the recovery fixed, then I can change my proposal to do so.
> > > 
> > > Early next week, I'll send a simplified version of my first 3 patches
> > > without these specific messages and without the preallocation handling in all
> > > remap cases, so you can evaluate this possibility.
> > 
> > Yes IMHO it'll always be helpful to separate it if possible.
> 
> I'm now sending a v8 version, without the specific messages and the remap
> notification. It should fix the main recovery bug we currently have. More
> messages and a notification dealing with pre-allocation can be added in a
> second step.
> 
> Please let me know if this v8 version can be integrated without the prealloc
> and specific messages?

IMHO fixing hugetlb pages is still progress on its own, even without any
added error message or proactive allocation during reset.

One issue is that the v8 still contains patch 3, which is for ARM kvm.  You
may need to post it separately for the ARM maintainers to review & collect.
I'll be able to queue patches 1-2.

Thanks,

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f89568bfa3..9a0d970ce1 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1296,6 +1296,24 @@  static void kvm_unpoison_all(void *param)
 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
 {
     HWPoisonPage *page;
+    struct RAMBlockInfo rb_info;
+
+    if (qemu_ram_block_info_from_addr(ram_addr, &rb_info)) {
+        size_t ps = rb_info.page_size;
+
+        if (ps > TARGET_PAGE_SIZE) {
+            uint64_t offset = QEMU_ALIGN_DOWN(ram_addr - rb_info.offset, ps);
+
+            if (rb_info.fd >= 0) {
+                error_report("Memory Error on large page from %s:%" PRIx64
+                             "+%" PRIx64 " +%zx", rb_info.idstr, offset,
+                             rb_info.fd_offset, ps);
+            } else {
+                error_report("Memory Error on large page from %s:%" PRIx64
+                            " +%zx", rb_info.idstr, offset, ps);
+            }
+        }
+    }
 
     QLIST_FOREACH(page, &hwpoison_page_list, list) {
         if (page->ram_addr == ram_addr) {
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 3771b2130c..190bd4f34a 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -110,6 +110,16 @@  int qemu_ram_get_fd(RAMBlock *rb);
 size_t qemu_ram_pagesize(RAMBlock *block);
 size_t qemu_ram_pagesize_largest(void);
 
+struct RAMBlockInfo {
+    char idstr[256];
+    ram_addr_t offset;
+    int fd;
+    uint64_t fd_offset;
+    size_t page_size;
+};
+bool qemu_ram_block_info_from_addr(ram_addr_t ram_addr,
+                                   struct RAMBlockInfo *block);
+
 /**
  * cpu_address_space_init:
  * @cpu: CPU to add this address space to
diff --git a/system/physmem.c b/system/physmem.c
index e8ff930bc9..686f569270 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1678,6 +1678,28 @@  size_t qemu_ram_pagesize_largest(void)
     return largest;
 }
 
+/* Copy RAMBlock information associated to the given ram_addr location */
+bool qemu_ram_block_info_from_addr(ram_addr_t ram_addr,
+                                   struct RAMBlockInfo *b_info)
+{
+    RAMBlock *rb;
+
+    assert(b_info);
+
+    RCU_READ_LOCK_GUARD();
+    rb =  qemu_get_ram_block(ram_addr);
+    if (!rb) {
+        return false;
+    }
+
+    pstrcat(b_info->idstr, sizeof(b_info->idstr), rb->idstr);
+    b_info->offset = rb->offset;
+    b_info->fd = rb->fd;
+    b_info->fd_offset = rb->fd_offset;
+    b_info->page_size = rb->page_size;
+    return true;
+}
+
 static int memory_try_enable_merging(void *addr, size_t len)
 {
     if (!machine_mem_merge(current_machine)) {
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index da30bdbb23..d9dedc6d74 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2389,6 +2389,9 @@  void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
                 kvm_cpu_synchronize_state(c);
                 if (!acpi_ghes_memory_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
                     kvm_inject_arm_sea(c);
+                    error_report("Guest Memory Error at QEMU addr %p and "
+                        "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
+                        addr, paddr, "BUS_MCEERR_AR");
                 } else {
                     error_report("failed to record the error");
                     abort();