diff mbox series

[v2,1/7] accel/kvm: Keep track of the HWPoisonPage page_size

Message ID 20241107102126.2183152-2-william.roche@oracle.com (mailing list archive)
State New
Headers show
Series hugetlbfs memory HW error fixes | expand

Commit Message

William Roche Nov. 7, 2024, 10:21 a.m. UTC
From: William Roche <william.roche@oracle.com>

When a memory page is added to the hwpoison_page_list, include
the page size information.  This size is the backend real page
size. To better deal with hugepages, we create a single entry
for the entire page.

Signed-off-by: William Roche <william.roche@oracle.com>
---
 accel/kvm/kvm-all.c       |  8 +++++++-
 include/exec/cpu-common.h |  1 +
 system/physmem.c          | 13 +++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

Comments

David Hildenbrand Nov. 12, 2024, 10:30 a.m. UTC | #1
On 07.11.24 11:21, “William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> When a memory page is added to the hwpoison_page_list, include
> the page size information.  This size is the backend real page
> size. To better deal with hugepages, we create a single entry
> for the entire page.
> 
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
>   accel/kvm/kvm-all.c       |  8 +++++++-
>   include/exec/cpu-common.h |  1 +
>   system/physmem.c          | 13 +++++++++++++
>   3 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 801cff16a5..6dd06f5edf 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, unsigned int extension)
>    */
>   typedef struct HWPoisonPage {
>       ram_addr_t ram_addr;
> +    size_t     page_size;
>       QLIST_ENTRY(HWPoisonPage) list;
>   } HWPoisonPage;
>   
> @@ -1278,7 +1279,7 @@ static void kvm_unpoison_all(void *param)
>   
>       QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>           QLIST_REMOVE(page, list);
> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
> +        qemu_ram_remap(page->ram_addr, page->page_size);
>           g_free(page);

I'm curious, can't we simply drop the size parameter from qemu_ram_remap()
completely and determine the page size internally from the RAMBlock that
we are looking up already?

This way, we avoid yet another lookup in qemu_ram_pagesize_from_addr(),
and can just handle it completely in qemu_ram_remap().

In particular, to be future proof, we should also align the offset down to
the pagesize.

I'm thinking about something like this:

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 801cff16a5..8a47aa7258 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1278,7 +1278,7 @@ static void kvm_unpoison_all(void *param)
  
      QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
          QLIST_REMOVE(page, list);
-        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
+        qemu_ram_remap(page->ram_addr);
          g_free(page);
      }
  }
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 638dc806a5..50a829d31f 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -67,7 +67,7 @@ typedef uintptr_t ram_addr_t;
  
  /* memory API */
  
-void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
+void qemu_ram_remap(ram_addr_t addr);
  /* This should not be used by devices.  */
  ram_addr_t qemu_ram_addr_from_host(void *ptr);
  ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
diff --git a/system/physmem.c b/system/physmem.c
index dc1db3a384..5f19bec089 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2167,10 +2167,10 @@ void qemu_ram_free(RAMBlock *block)
  }
  
  #ifndef _WIN32
-void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
+void qemu_ram_remap(ram_addr_t addr)
  {
      RAMBlock *block;
-    ram_addr_t offset;
+    ram_addr_t offset, length;
      int flags;
      void *area, *vaddr;
      int prot;
@@ -2178,6 +2178,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
      RAMBLOCK_FOREACH(block) {
          offset = addr - block->offset;
          if (offset < block->max_length) {
+            /* Respect the pagesize of our RAMBlock. */
+            offset = QEMU_ALIGN_DOWN(offset, qemu_ram_pagesize(block));
+            length = qemu_ram_pagesize(block);
+
              vaddr = ramblock_ptr(block, offset);
              if (block->flags & RAM_PREALLOC) {
                  ;
@@ -2206,6 +2210,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
                  memory_try_enable_merging(vaddr, length);
                  qemu_ram_setup_dump(vaddr, length);
              }
+
+            break;
          }
      }
  }
William Roche Nov. 12, 2024, 6:17 p.m. UTC | #2
On 11/12/24 11:30, David Hildenbrand wrote:
> On 07.11.24 11:21, “William Roche wrote:
>> From: William Roche <william.roche@oracle.com>
>>
>> When a memory page is added to the hwpoison_page_list, include
>> the page size information.  This size is the backend real page
>> size. To better deal with hugepages, we create a single entry
>> for the entire page.
>>
>> Signed-off-by: William Roche <william.roche@oracle.com>
>> ---
>>   accel/kvm/kvm-all.c       |  8 +++++++-
>>   include/exec/cpu-common.h |  1 +
>>   system/physmem.c          | 13 +++++++++++++
>>   3 files changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index 801cff16a5..6dd06f5edf 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, unsigned 
>> int extension)
>>    */
>>   typedef struct HWPoisonPage {
>>       ram_addr_t ram_addr;
>> +    size_t     page_size;
>>       QLIST_ENTRY(HWPoisonPage) list;
>>   } HWPoisonPage;
>> @@ -1278,7 +1279,7 @@ static void kvm_unpoison_all(void *param)
>>       QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>>           QLIST_REMOVE(page, list);
>> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
>> +        qemu_ram_remap(page->ram_addr, page->page_size);
>>           g_free(page);
> 
> I'm curious, can't we simply drop the size parameter from qemu_ram_remap()
> completely and determine the page size internally from the RAMBlock that
> we are looking up already?
> 
> This way, we avoid yet another lookup in qemu_ram_pagesize_from_addr(),
> and can just handle it completely in qemu_ram_remap().
> 
> In particular, to be future proof, we should also align the offset down to
> the pagesize.
> 
> I'm thinking about something like this:
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 801cff16a5..8a47aa7258 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1278,7 +1278,7 @@ static void kvm_unpoison_all(void *param)
> 
>       QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>           QLIST_REMOVE(page, list);
> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
> +        qemu_ram_remap(page->ram_addr);
>           g_free(page);
>       }
>   }
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index 638dc806a5..50a829d31f 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -67,7 +67,7 @@ typedef uintptr_t ram_addr_t;
> 
>   /* memory API */
> 
> -void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
> +void qemu_ram_remap(ram_addr_t addr);
>   /* This should not be used by devices.  */
>   ram_addr_t qemu_ram_addr_from_host(void *ptr);
>   ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
> diff --git a/system/physmem.c b/system/physmem.c
> index dc1db3a384..5f19bec089 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -2167,10 +2167,10 @@ void qemu_ram_free(RAMBlock *block)
>   }
> 
>   #ifndef _WIN32
> -void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
> +void qemu_ram_remap(ram_addr_t addr)
>   {
>       RAMBlock *block;
> -    ram_addr_t offset;
> +    ram_addr_t offset, length;
>       int flags;
>       void *area, *vaddr;
>       int prot;
> @@ -2178,6 +2178,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t 
> length)
>       RAMBLOCK_FOREACH(block) {
>           offset = addr - block->offset;
>           if (offset < block->max_length) {
> +            /* Respect the pagesize of our RAMBlock. */
> +            offset = QEMU_ALIGN_DOWN(offset, qemu_ram_pagesize(block));
> +            length = qemu_ram_pagesize(block);
> +
>               vaddr = ramblock_ptr(block, offset);
>               if (block->flags & RAM_PREALLOC) {
>                   ;
> @@ -2206,6 +2210,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t 
> length)
>                   memory_try_enable_merging(vaddr, length);
>                   qemu_ram_setup_dump(vaddr, length);
>               }
> +
> +            break;
>           }
>       }
>   }
> 
> 


Yes this is a working possibility, and as you say it would provide the 
advantage to avoid a size lookup (needed because the kernel siginfo can 
be incorrect) and avoid tracking the poisoned pages size, with the 
addresses.

But if we want to keep the information about the loss of a large page 
(which I think is useful) we would have to introduce the page size 
lookup when adding the page to the poison list. So according to me, 
keeping track of the page size and reusing it on remap isn't so bad. But 
if you prefer that we don't track the page size and do a lookup on page 
insert into the poison list and another in qemu_ram_remap(), of course 
we can do that.

There is also something to consider about the future: we'll also have to 
deal with migration of VM that have been impacted by a memory error. And 
knowing about the poisoned pages size could be useful too. But this is 
another topic...

I would vote to keep this size tracking.
David Hildenbrand Nov. 12, 2024, 9:35 p.m. UTC | #3
On 12.11.24 19:17, William Roche wrote:
> On 11/12/24 11:30, David Hildenbrand wrote:
>> On 07.11.24 11:21, “William Roche wrote:
>>> From: William Roche <william.roche@oracle.com>
>>>
>>> When a memory page is added to the hwpoison_page_list, include
>>> the page size information.  This size is the backend real page
>>> size. To better deal with hugepages, we create a single entry
>>> for the entire page.
>>>
>>> Signed-off-by: William Roche <william.roche@oracle.com>
>>> ---
>>>    accel/kvm/kvm-all.c       |  8 +++++++-
>>>    include/exec/cpu-common.h |  1 +
>>>    system/physmem.c          | 13 +++++++++++++
>>>    3 files changed, 21 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>> index 801cff16a5..6dd06f5edf 100644
>>> --- a/accel/kvm/kvm-all.c
>>> +++ b/accel/kvm/kvm-all.c
>>> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, unsigned
>>> int extension)
>>>     */
>>>    typedef struct HWPoisonPage {
>>>        ram_addr_t ram_addr;
>>> +    size_t     page_size;
>>>        QLIST_ENTRY(HWPoisonPage) list;
>>>    } HWPoisonPage;
>>> @@ -1278,7 +1279,7 @@ static void kvm_unpoison_all(void *param)
>>>        QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>>>            QLIST_REMOVE(page, list);
>>> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
>>> +        qemu_ram_remap(page->ram_addr, page->page_size);
>>>            g_free(page);
>>
>> I'm curious, can't we simply drop the size parameter from qemu_ram_remap()
>> completely and determine the page size internally from the RAMBlock that
>> we are looking up already?
>>
>> This way, we avoid yet another lookup in qemu_ram_pagesize_from_addr(),
>> and can just handle it completely in qemu_ram_remap().
>>
>> In particular, to be future proof, we should also align the offset down to
>> the pagesize.
>>
>> I'm thinking about something like this:
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index 801cff16a5..8a47aa7258 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -1278,7 +1278,7 @@ static void kvm_unpoison_all(void *param)
>>
>>        QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>>            QLIST_REMOVE(page, list);
>> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
>> +        qemu_ram_remap(page->ram_addr);
>>            g_free(page);
>>        }
>>    }
>> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
>> index 638dc806a5..50a829d31f 100644
>> --- a/include/exec/cpu-common.h
>> +++ b/include/exec/cpu-common.h
>> @@ -67,7 +67,7 @@ typedef uintptr_t ram_addr_t;
>>
>>    /* memory API */
>>
>> -void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
>> +void qemu_ram_remap(ram_addr_t addr);
>>    /* This should not be used by devices.  */
>>    ram_addr_t qemu_ram_addr_from_host(void *ptr);
>>    ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
>> diff --git a/system/physmem.c b/system/physmem.c
>> index dc1db3a384..5f19bec089 100644
>> --- a/system/physmem.c
>> +++ b/system/physmem.c
>> @@ -2167,10 +2167,10 @@ void qemu_ram_free(RAMBlock *block)
>>    }
>>
>>    #ifndef _WIN32
>> -void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
>> +void qemu_ram_remap(ram_addr_t addr)
>>    {
>>        RAMBlock *block;
>> -    ram_addr_t offset;
>> +    ram_addr_t offset, length;
>>        int flags;
>>        void *area, *vaddr;
>>        int prot;
>> @@ -2178,6 +2178,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t
>> length)
>>        RAMBLOCK_FOREACH(block) {
>>            offset = addr - block->offset;
>>            if (offset < block->max_length) {
>> +            /* Respect the pagesize of our RAMBlock. */
>> +            offset = QEMU_ALIGN_DOWN(offset, qemu_ram_pagesize(block));
>> +            length = qemu_ram_pagesize(block);
>> +
>>                vaddr = ramblock_ptr(block, offset);
>>                if (block->flags & RAM_PREALLOC) {
>>                    ;
>> @@ -2206,6 +2210,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t
>> length)
>>                    memory_try_enable_merging(vaddr, length);
>>                    qemu_ram_setup_dump(vaddr, length);
>>                }
>> +
>> +            break;
>>            }
>>        }
>>    }
>>
>>
> 
> 
> Yes this is a working possibility, and as you say it would provide the
> advantage to avoid a size lookup (needed because the kernel siginfo can
> be incorrect) and avoid tracking the poisoned pages size, with the
> addresses.
 > > But if we want to keep the information about the loss of a large page
> (which I think is useful) we would have to introduce the page size
> lookup when adding the page to the poison list. So according to me,

Right, that would be independent of the remap logic.

What I dislike about qemu_ram_remap() is that it looks like we could be 
remapping a range that's possibly larger than a single page.

But it really only works on a single address, expanding that to the 
page. Passing in a length that crosses RAMBlocks would not work as 
expected ...

So I'd prefer if we let qemu_ram_remap() do exactly that ... remap a 
single page ...

> keeping track of the page size and reusing it on remap isn't so bad. But
> if you prefer that we don't track the page size and do a lookup on page
> insert into the poison list and another in qemu_ram_remap(), of course
> we can do that.

... and lookup the page size manually here if we really have to, for 
example to warn/trace errors.

 > > There is also something to consider about the future: we'll also 
have to
> deal with migration of VM that have been impacted by a memory error. And
> knowing about the poisoned pages size could be useful too. But this is
> another topic...

Yes, although the destination should be able to derive the same thing 
from the address I guess. We expect src and dst QEMU to use the same 
memory backing.
diff mbox series

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 801cff16a5..6dd06f5edf 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1266,6 +1266,7 @@  int kvm_vm_check_extension(KVMState *s, unsigned int extension)
  */
 typedef struct HWPoisonPage {
     ram_addr_t ram_addr;
+    size_t     page_size;
     QLIST_ENTRY(HWPoisonPage) list;
 } HWPoisonPage;
 
@@ -1278,7 +1279,7 @@  static void kvm_unpoison_all(void *param)
 
     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
         QLIST_REMOVE(page, list);
-        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
+        qemu_ram_remap(page->ram_addr, page->page_size);
         g_free(page);
     }
 }
@@ -1286,6 +1287,10 @@  static void kvm_unpoison_all(void *param)
 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
 {
     HWPoisonPage *page;
+    size_t sz = qemu_ram_pagesize_from_addr(ram_addr);
+
+    if (sz > TARGET_PAGE_SIZE)
+        ram_addr = ROUND_DOWN(ram_addr, sz);
 
     QLIST_FOREACH(page, &hwpoison_page_list, list) {
         if (page->ram_addr == ram_addr) {
@@ -1294,6 +1299,7 @@  void kvm_hwpoison_page_add(ram_addr_t ram_addr)
     }
     page = g_new(HWPoisonPage, 1);
     page->ram_addr = ram_addr;
+    page->page_size = sz;
     QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
 }
 
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 638dc806a5..8f8f7ad567 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -108,6 +108,7 @@  bool qemu_ram_is_named_file(RAMBlock *rb);
 int qemu_ram_get_fd(RAMBlock *rb);
 
 size_t qemu_ram_pagesize(RAMBlock *block);
+size_t qemu_ram_pagesize_from_addr(ram_addr_t addr);
 size_t qemu_ram_pagesize_largest(void);
 
 /**
diff --git a/system/physmem.c b/system/physmem.c
index dc1db3a384..750604d47d 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1665,6 +1665,19 @@  size_t qemu_ram_pagesize(RAMBlock *rb)
     return rb->page_size;
 }
 
+/* Return backend real page size used for the given ram_addr. */
+size_t qemu_ram_pagesize_from_addr(ram_addr_t addr)
+{
+    RAMBlock *rb;
+
+    RCU_READ_LOCK_GUARD();
+    rb =  qemu_get_ram_block(addr);
+    if (!rb) {
+        return TARGET_PAGE_SIZE;
+    }
+    return qemu_ram_pagesize(rb);
+}
+
 /* Returns the largest size of page in use */
 size_t qemu_ram_pagesize_largest(void)
 {