Message ID | 20240510182926.763131-2-axelrasmussen@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | arch/fault: don't print logs for simulated poison errors | expand |
On Fri, May 10, 2024 at 11:29:26AM -0700, Axel Rasmussen wrote: > For real MCEs, various architectures print log messages when poisoned > memory is accessed (which results in a SIGBUS). These messages can be > important for users to understand the issue. > > On the other hand, we have two other cases: swapin errors and simulated > poisons via UFFDIO_POISON. These cases also result in SIGBUS, but they > aren't "real" hardware memory poisoning events, so we want to avoid > logging MCE error messages to dmesg for these events. This avoids > spamming the kernel log, and possibly drowning out real events with > these other cases. > > To identify this situation, add a new VM_FAULT_HWPOISON_SILENT flag. > This is expected to be set *in addition to* one of the existing > VM_FAULT_HWPOISON or VM_FAULT_HWPOISON_LARGE flags (which are mutually > exclusive). > > Reviewed-by: John Hubbard <jhubbard@nvidia.com> > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> Acked-by: Peter Xu <peterx@redhat.com> One nicpick below. > --- > arch/parisc/mm/fault.c | 7 +++++-- > arch/powerpc/mm/fault.c | 6 ++++-- > arch/x86/mm/fault.c | 6 ++++-- > include/linux/mm_types.h | 34 ++++++++++++++++++++-------------- > mm/hugetlb.c | 3 ++- > mm/memory.c | 2 +- > 6 files changed, 36 insertions(+), 22 deletions(-) > > diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c > index c39de84e98b0..6c5e8d6498bf 100644 > --- a/arch/parisc/mm/fault.c > +++ b/arch/parisc/mm/fault.c > @@ -400,9 +400,12 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, > #ifdef CONFIG_MEMORY_FAILURE > if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { > unsigned int lsb = 0; > - printk(KERN_ERR > + > + if (!(fault & VM_FAULT_HWPOISON_SILENT)) { > + pr_err( > "MCE: Killing %s:%d due to hardware memory corruption fault at %08lx\n", > - tsk->comm, tsk->pid, address); > + tsk->comm, tsk->pid, address); > + } > /* > * Either small page or large page may be poisoned. > * In other words, VM_FAULT_HWPOISON_LARGE and > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c > index 215690452495..c43bb6193a80 100644 > --- a/arch/powerpc/mm/fault.c > +++ b/arch/powerpc/mm/fault.c > @@ -147,8 +147,10 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, > if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { > unsigned int lsb = 0; /* shutup gcc */ > > - pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", > - current->comm, current->pid, address); > + if (!(fault & VM_FAULT_HWPOISON_SILENT)) { > + pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", > + current->comm, current->pid, address); > + } > > if (fault & VM_FAULT_HWPOISON_LARGE) > lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c > index 67b18adc75dd..9ae5cc6bd933 100644 > --- a/arch/x86/mm/fault.c > +++ b/arch/x86/mm/fault.c > @@ -964,9 +964,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, > struct task_struct *tsk = current; > unsigned lsb = 0; > > - pr_err( > + if (!(fault & VM_FAULT_HWPOISON_SILENT)) { > + pr_err( > "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", > - tsk->comm, tsk->pid, address); > + tsk->comm, tsk->pid, address); > + } > if (fault & VM_FAULT_HWPOISON_LARGE) > lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); > if (fault & VM_FAULT_HWPOISON) > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 24323c7d0bd4..7663a2725341 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -1224,6 +1224,10 @@ typedef __bitwise unsigned int vm_fault_t; > * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded > * in upper bits > * @VM_FAULT_SIGSEGV: segmentation fault > + * @VM_FAULT_HWPOISON_SILENT Hit a poisoned pte marker, which should not be > + * logged to dmesg since it's something besides a > + * real hardware memory error (swapin error, > + * simulated poison via UFFDIO_POISON, etc.). IMHO we shouldn't mention that detail, but only state the effect which is to not report the event to syslog. There's no hard rule that a pte marker can't reflect a real page poison in the future even MCE. Actually I still remember most places don't care about the pfn in the hwpoison swap entry so maybe we can even do it? But that's another story regardless.. And also not report swapin error is, IMHO, only because arch errors said "MCE" in the error logs which may not apply here. Logically speaking swapin error should also be reported so admin knows better on why a proc is killed. Now it can still confuse the admin if it really happens, iiuc. > * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page > * @VM_FAULT_LOCKED: ->fault locked the returned page > * @VM_FAULT_RETRY: ->fault blocked, must retry > @@ -1237,20 +1241,21 @@ typedef __bitwise unsigned int vm_fault_t; > * > */ > enum vm_fault_reason { > - VM_FAULT_OOM = (__force vm_fault_t)0x000001, > - VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, > - VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, > - VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, > - VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, > - VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, > - VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, > - VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, > - VM_FAULT_RETRY = (__force vm_fault_t)0x000400, > - VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, > - VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, > - VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, > - VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, > - VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, > + VM_FAULT_OOM = (__force vm_fault_t)0x000001, > + VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, > + VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, > + VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, > + VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, > + VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, > + VM_FAULT_HWPOISON_SILENT = (__force vm_fault_t)0x000080, > + VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, > + VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, > + VM_FAULT_RETRY = (__force vm_fault_t)0x000400, > + VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, > + VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, > + VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, > + VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, > + VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, > }; > > /* Encode hstate index for a hwpoisoned large page */ > @@ -1268,6 +1273,7 @@ enum vm_fault_reason { > { VM_FAULT_HWPOISON, "HWPOISON" }, \ > { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ > { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ > + { VM_FAULT_HWPOISON_SILENT, "HWPOISON_SILENT" }, \ > { VM_FAULT_NOPAGE, "NOPAGE" }, \ > { VM_FAULT_LOCKED, "LOCKED" }, \ > { VM_FAULT_RETRY, "RETRY" }, \ > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 6be78e7d4f6e..91517cd7f44c 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -6485,7 +6485,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, > pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); > > if (marker & PTE_MARKER_POISONED) { > - ret = VM_FAULT_HWPOISON_LARGE | > + ret = VM_FAULT_HWPOISON_SILENT | > + VM_FAULT_HWPOISON_LARGE | > VM_FAULT_SET_HINDEX(hstate_index(h)); > goto out_mutex; > } > diff --git a/mm/memory.c b/mm/memory.c > index eea6e4984eae..721c0731cef2 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -3938,7 +3938,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) > > /* Higher priority than uffd-wp when data corrupted */ > if (marker & PTE_MARKER_POISONED) > - return VM_FAULT_HWPOISON; > + return VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_SILENT; > > if (pte_marker_entry_uffd_wp(entry)) > return pte_marker_handle_uffd_wp(vmf); > -- > 2.45.0.118.g7fe29c98d7-goog > >
On Fri, May 10, 2024 at 03:29:48PM -0400, Peter Xu wrote: > IMHO we shouldn't mention that detail, but only state the effect which is > to not report the event to syslog. > > There's no hard rule that a pte marker can't reflect a real page poison in > the future even MCE. Actually I still remember most places don't care > about the pfn in the hwpoison swap entry so maybe we can even do it? But > that's another story regardless.. But we should not use pte markers for real hwpoisons events (aka MCE), right? I mean, we do have the means to mark a page as hwpoisoned when a real MCE gets triggered, why would we want a pte marker to also reflect that? Or is that something for userfaultd realm? > And also not report swapin error is, IMHO, only because arch errors said > "MCE" in the error logs which may not apply here. Logically speaking > swapin error should also be reported so admin knows better on why a proc is > killed. Now it can still confuse the admin if it really happens, iiuc. I am bit confused by this. It seems we create poisoned pte markers on swap errors (e.g: unuse_pte()), which get passed down the chain with VM_FAULT_HWPOISON, which end up in sigbus (I guess?). This all seems very subtle to me. First of all, why not passing VM_FAULT_SIGBUS if that is what will end up happening? I mean, at the moment that is not possible because we convolute swaping errors and uffd poison in the same type of marker, so we do not have any means to differentiate between the two of them. Would it make sense to create yet another pte marker type to split that up? Because when I look at VM_FAULT_HWPOISON, I get reminded of MCE stuff, and that does not hold here.
On Tue, May 14, 2024 at 10:26:49PM +0200, Oscar Salvador wrote: > On Fri, May 10, 2024 at 03:29:48PM -0400, Peter Xu wrote: > > IMHO we shouldn't mention that detail, but only state the effect which is > > to not report the event to syslog. > > > > There's no hard rule that a pte marker can't reflect a real page poison in > > the future even MCE. Actually I still remember most places don't care > > about the pfn in the hwpoison swap entry so maybe we can even do it? But > > that's another story regardless.. > > But we should not use pte markers for real hwpoisons events (aka MCE), right? The question is whether we can't. Now we reserved a swp entry just for hwpoison and it makes sense only because we cached the poisoned pfn inside. My long standing question is why do we ever need that pfn after all. If we don't need the pfn, we simply need a bit in the pgtable entry saying that it's poisoned, if accessed we should kill the process using sigbus. I used to comment on this before, the only path that uses that pfn is check_hwpoisoned_entry(), which was introduced in: commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4 Author: Naoya Horiguchi <nao.horiguchi@gmail.com> Date: Mon Jun 28 19:43:14 2021 -0700 mm,hwpoison: send SIGBUS with error virutal address Now an action required MCE in already hwpoisoned address surely sends a SIGBUS to current process, but the SIGBUS doesn't convey error virtual address. That's not optimal for hwpoison-aware applications. To fix the issue, make memory_failure() call kill_accessing_process(), that does pagetable walk to find the error virtual address. It could find multiple virtual addresses for the same error page, and it seems hard to tell which virtual address is correct one. But that's rare and sending incorrect virtual address could be better than no address. So let's report the first found virtual address for now. So this time I read more on this and Naoya explained why - it's only used so far to dump the VA of the poisoned entry. However what confused me is, if an entry is poisoned already logically we dump that message in the fault handler not memory_failure(), which is: MCE: Killing uffd-unit-tests:650 due to hardware memory corruption fault at 7f3589d7e000 So perhaps we're trying to also dump that when the MCEs (points to the same pfn) are only generated concurrently? I donno much on hwpoison so I cannot tell, there's also implication where it's only triggered if MF_ACTION_REQUIRED. But I think it means hwpoison may work without pfn encoded, but I don't know the implication to lose that dmesg line. > I mean, we do have the means to mark a page as hwpoisoned when a real > MCE gets triggered, why would we want a pte marker to also reflect that? > Or is that something for userfaultd realm? No it's not userfaultfd realm.. it's just that pte marker should be a generic concept, so it logically can be used outside userfaultfd. That's also why it's used in swapin errors, in which case we don't use anything else in this case but a bit to reflect "this page is bad". > > > And also not report swapin error is, IMHO, only because arch errors said > > "MCE" in the error logs which may not apply here. Logically speaking > > swapin error should also be reported so admin knows better on why a proc is > > killed. Now it can still confuse the admin if it really happens, iiuc. > > I am bit confused by this. > It seems we create poisoned pte markers on swap errors (e.g: > unuse_pte()), which get passed down the chain with VM_FAULT_HWPOISON, > which end up in sigbus (I guess?). > > This all seems very subtle to me. > > First of all, why not passing VM_FAULT_SIGBUS if that is what will end > up happening? > I mean, at the moment that is not possible because we convolute swaping > errors and uffd poison in the same type of marker, so we do not have any > means to differentiate between the two of them. > > Would it make sense to create yet another pte marker type to split that > up? Because when I look at VM_FAULT_HWPOISON, I get reminded of MCE > stuff, and that does not hold here. We used to not dump error for swapin error. Note that here what I am saying is not that Axel is doing things wrong, but it's just that logically swapin error (as pte marker) can also be with !QUIET, so my final point is we may want to avoid having the assumption that "pte marker should always be QUITE", because I want to make it clear that pte marker can used in any form, so itself shouldn't imply anything.. Thanks,
On Tue, May 14, 2024 at 03:34:24PM -0600, Peter Xu wrote: > The question is whether we can't. > > Now we reserved a swp entry just for hwpoison and it makes sense only > because we cached the poisoned pfn inside. My long standing question is > why do we ever need that pfn after all. If we don't need the pfn, we > simply need a bit in the pgtable entry saying that it's poisoned, if > accessed we should kill the process using sigbus. > > I used to comment on this before, the only path that uses that pfn is > check_hwpoisoned_entry(), which was introduced in: > > commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4 > Author: Naoya Horiguchi <nao.horiguchi@gmail.com> > Date: Mon Jun 28 19:43:14 2021 -0700 > > mm,hwpoison: send SIGBUS with error virutal address > > Now an action required MCE in already hwpoisoned address surely sends a > SIGBUS to current process, but the SIGBUS doesn't convey error virtual > address. That's not optimal for hwpoison-aware applications. > > To fix the issue, make memory_failure() call kill_accessing_process(), > that does pagetable walk to find the error virtual address. It could find > multiple virtual addresses for the same error page, and it seems hard to > tell which virtual address is correct one. But that's rare and sending > incorrect virtual address could be better than no address. So let's > report the first found virtual address for now. > > So this time I read more on this and Naoya explained why - it's only used > so far to dump the VA of the poisoned entry. Well, not really dumped, but we just pass the VA down the chain to the signal handler. But on the question whether we need the pfn encoded, I am not sure actually. check_hwpoisoned_entry() checks whether the pfn where the pte sits is the same as the hwpoisoned one, so we know if the process has to be killed. Now, could we get away with using pte_page() instead of pte_pfn() in there, and passing the hwpoisoned page instead ot the pfn? I am trying to think of the implications, then we should not need the encoded pfn? > However what confused me is, if an entry is poisoned already logically we > dump that message in the fault handler not memory_failure(), which is: > > MCE: Killing uffd-unit-tests:650 due to hardware memory corruption fault at 7f3589d7e000 > > So perhaps we're trying to also dump that when the MCEs (points to the same > pfn) are only generated concurrently? I donno much on hwpoison so I cannot > tell, there's also implication where it's only triggered if > MF_ACTION_REQUIRED. But I think it means hwpoison may work without pfn > encoded, but I don't know the implication to lose that dmesg line. Not necesarily concurrently, but simply if for any reason the page could not have been unmapped. Let us say you have ProcessA and ProcessB mapping the same page. We get an MCE for that page but we could not unmapped it, at least not from all processes (maybe just ProcessA). memory-failure code will mark it as hwpoison, now ProcessA faults it in and gets killed because we see that the page is hwpoisoned in the fault path, so we sill send VM_FAULT_HWPOISON all the way down till you see the: "MCE: Killing uffd-unit-tests:650 due to hardware memory corruption fault at 7f3589d7e000" from e.g: arch/x86/mm/fault.c:do_sigbus() Now, ProcessB still has the page mapped, so upon re-accessing it, it will trigger a new MCE event. memory-failure code will see that this page has already been marked as hwpoisoned, but since we failed to unmap it (otherwise no one should be re-accessing it), it goes: "ok, let's just kill everybody who has this page mapped". > We used to not dump error for swapin error. Note that here what I am > saying is not that Axel is doing things wrong, but it's just that logically > swapin error (as pte marker) can also be with !QUIET, so my final point is > we may want to avoid having the assumption that "pte marker should always > be QUITE", because I want to make it clear that pte marker can used in any > form, so itself shouldn't imply anything.. I think it would make more sense if we have a separate marker for swapin errors? I mean, deep down, they do not mean the same as poison, right? Then you can choose which events get to be silent because you do not care, and which ones need to scream loud. I think swapin errors belong to the latter. At least I a heads-up why a process is getting killed is appreciated, IMHO.
On Fri, May 10, 2024 at 11:29:26AM -0700, Axel Rasmussen wrote: > @@ -3938,7 +3938,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) > > /* Higher priority than uffd-wp when data corrupted */ > if (marker & PTE_MARKER_POISONED) > - return VM_FAULT_HWPOISON; > + return VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_SILENT; If you know here that this poisoning should be silent, why do you have to make it all complicated and propagate it into arch code, waste a separate VM_FAULT flag just for that instead of simply returning here a VM_FAULT_COMPLETED or some other innocuous value which would stop processing the fault?
On Wed, May 15, 2024 at 12:41:42PM +0200, Borislav Petkov wrote: > On Fri, May 10, 2024 at 11:29:26AM -0700, Axel Rasmussen wrote: > > @@ -3938,7 +3938,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) > > > > /* Higher priority than uffd-wp when data corrupted */ > > if (marker & PTE_MARKER_POISONED) > > - return VM_FAULT_HWPOISON; > > + return VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_SILENT; > > If you know here that this poisoning should be silent, why do you have > to make it all complicated and propagate it into arch code, waste > a separate VM_FAULT flag just for that instead of simply returning here > a VM_FAULT_COMPLETED or some other innocuous value which would stop > processing the fault? AFAIK, He only wants it to be silent wrt. the arch fault handler not screaming, but he still wants to be able to trigger force_sig_mceerr().
On Wed, May 15, 2024 at 3:54 AM Oscar Salvador <osalvador@suse.de> wrote: > > On Wed, May 15, 2024 at 12:41:42PM +0200, Borislav Petkov wrote: > > On Fri, May 10, 2024 at 11:29:26AM -0700, Axel Rasmussen wrote: > > > @@ -3938,7 +3938,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) > > > > > > /* Higher priority than uffd-wp when data corrupted */ > > > if (marker & PTE_MARKER_POISONED) > > > - return VM_FAULT_HWPOISON; > > > + return VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_SILENT; > > > > If you know here that this poisoning should be silent, why do you have > > to make it all complicated and propagate it into arch code, waste > > a separate VM_FAULT flag just for that instead of simply returning here > > a VM_FAULT_COMPLETED or some other innocuous value which would stop > > processing the fault? > > AFAIK, He only wants it to be silent wrt. the arch fault handler not screaming, > but he still wants to be able to trigger force_sig_mceerr(). Right, the goal is to still have the process get a SIGBUS, but to avoid the "MCE error" log message. The basic issue is, unprivileged users can set these markers up, and thereby completely spam up the log. Also since this is a process-specific thing, and it's not a real hardware poison event, it's unclear system admins care at all at a global level (this is why we didn't want to switch to just printk_ratelimited for example). Better to let the process handle the SIGBUS however it likes for its use case (logging a message elsewhere, etc.). That said, one thing I'm not sure about is whether or not VM_FAULT_SIGBUS is a viable alternative (returned for a new PTE marker type specific to simulated poison). The goal of the simulated poison feature is to "closely simulate" a real hardware poison event. If you live migrate a VM from a host with real poisoned memory, to a new host: you'd want to keep the same behavior if the guest accessed those addresses again, so as not to confuse the guest about why it suddenly became "un-poisoned". At a basic level I think VM_FAULT_SIGBUS gives us what we want (send SIGBUS to the process, don't log about MCEs), but I'm not confident I know all the differences vs. VM_FAULT_HWPOISON on all the arches. > > > -- > Oscar Salvador > SUSE Labs
On Wed, May 15, 2024 at 10:33:03AM -0700, Axel Rasmussen wrote: > Right, the goal is to still have the process get a SIGBUS, but to > avoid the "MCE error" log message. The basic issue is, unprivileged > users can set these markers up, and thereby completely spam up the > log. What is the real attack scenario you want to protect against? Or is this something hypothetical? > That said, one thing I'm not sure about is whether or not > VM_FAULT_SIGBUS is a viable alternative (returned for a new PTE marker > type specific to simulated poison). The goal of the simulated poison > feature is to "closely simulate" a real hardware poison event. If you > live migrate a VM from a host with real poisoned memory, to a new > host: you'd want to keep the same behavior if the guest accessed those > addresses again, so as not to confuse the guest about why it suddenly > became "un-poisoned". Well, the recovery action is to poison the page and the process should be resilient enough and allocate a new, clean page which doesn't trigger hw poison hopefully, if possible. It doesn't make a whole lotta sense if poison "remains". Hardware poison you don't want to touch a second time either - otherwise you might consume that poison and die.
On Wed, May 15, 2024 at 11:33 AM Borislav Petkov <bp@alien8.de> wrote: > > On Wed, May 15, 2024 at 10:33:03AM -0700, Axel Rasmussen wrote: > > Right, the goal is to still have the process get a SIGBUS, but to > > avoid the "MCE error" log message. The basic issue is, unprivileged > > users can set these markers up, and thereby completely spam up the > > log. > > What is the real attack scenario you want to protect against? > > Or is this something hypothetical? An unprivileged process can allocate a VMA, use the userfaultfd API to install one of these PTE markers, and then register a no-op SIGBUS handler. Now it can access that address in a tight loop, generating a huge number of kernel log messages. This can e.g. bog down the system, or at least drown out other important log messages. For example the userfaultfd selftest does something similar to this to test that the API works properly. :) Even in a non-contrived / non-malicious case, use of this API could have similar effects. If nothing else, the log message can be confusing to administrators: they state that an MCE occurred, whereas with the simulated poison API, this is not the case; it isn't a "real" MCE / hardware error. > > > That said, one thing I'm not sure about is whether or not > > VM_FAULT_SIGBUS is a viable alternative (returned for a new PTE marker > > type specific to simulated poison). The goal of the simulated poison > > feature is to "closely simulate" a real hardware poison event. If you > > live migrate a VM from a host with real poisoned memory, to a new > > host: you'd want to keep the same behavior if the guest accessed those > > addresses again, so as not to confuse the guest about why it suddenly > > became "un-poisoned". > > Well, the recovery action is to poison the page and the process should > be resilient enough and allocate a new, clean page which doesn't trigger > hw poison hopefully, if possible. > > It doesn't make a whole lotta sense if poison "remains". Hardware poison > you don't want to touch a second time either - otherwise you might > consume that poison and die. In the KVM use case, the host can't just allocate a new page, because it doesn't know what the guest might have had stored there. Best we can do is propagate the poison into the guest, and let the guest OS deal with it as it sees fit, and mark the page poisoned on the host. I don't disagree the guest *shouldn't* reaccess it in this case. :) But if it did, it should get another poison event just as you say. And, live migration between physical hosts should be transparent to the guest. So if the guest gets a poison, and then we live migrate it, and then it accesses that address again, it should likewise get another poison event, just as before. Even though the underlying physical memory is *not* poisoned on the new host machine. So the use case is, after live migration, we install one of these PTE markers to simulate a poison event in case the address is accessed again. But since it isn't *really* a hardware error on the new host, no reason to spam the host kernel log when / if this occurs. > > -- > Regards/Gruss, > Boris. > > https://people.kernel.org/tglx/notes-about-netiquette
On Wed, May 15, 2024 at 12:19:16PM -0700, Axel Rasmussen wrote: > An unprivileged process can allocate a VMA, use the userfaultfd API to > install one of these PTE markers, and then register a no-op SIGBUS > handler. Now it can access that address in a tight loop, Maybe the userfaultfd should not allow this, I dunno. You made me look at this thing and to me it all sounds weird. One thread does page fault handling for the other and that helps with live migration somehow. OMG, whaaat? Maybe I don't understand it and probably never will... But, for example, membarrier used do to a stupid thing of allowing one thread to hammer another with an IPI storm. Bad bad idea. So it got fixed. All I'm saying is, if unprivileged processes can do crap, they should be prevented from doing crap. Like ratelimiting the pagefaults or whatnot. One of the recovery action strategies from memory poison is, well, you kill the process. If you can detect the hammering process which installed that page marker, you kill it. Problem solved. But again, this userfaultfd thing sounds really weird so I could very well be way wrong. > Even in a non-contrived / non-malicious case, use of this API could > have similar effects. If nothing else, the log message can be > confusing to administrators: they state that an MCE occurred, whereas > with the simulated poison API, this is not the case; it isn't a "real" > MCE / hardware error. Yeah, I read that part in Documentation/admin-guide/mm/userfaultfd.rst Simulated poison huh? Another WTF. > In the KVM use case, the host can't just allocate a new page, because > it doesn't know what the guest might have had stored there. Best we Ok, let's think of real hw poison. When doing the recovery, you don't care what's stored there because as far as the hardware is concerned, if you consume that poison the *whole* machine might go down. So you lose the page. Plain and simple. And the guest can go visit the bureau of complaints and grievances. Still better than killing the guest or even the whole host with other guests running on it. > can do is propagate the poison into the guest, and let the guest OS > deal with it as it sees fit, and mark the page poisoned on the host. You mark the page as poison on the host and you yank it from under the guest. That physical frame is gone and the faster all the actors involved understand that, the better. > I don't disagree the guest *shouldn't* reaccess it in this case. :) > But if it did, it should get another poison event just as you say. Yes, it shouldn't. Look at memory_failure(). This will kill whole processes if it has to, depending on what the page is used for. > And, live migration between physical hosts should be transparent to > the guest. So if the guest gets a poison, and then we live migrate it, So if I were to design this, I'd do it this way: 0. guest gets hw poison injected 1. it runs memory_failure() and it kills the processes using the page. 2. page is marked poisoned on the host so no other guest gets it. That's it. No second accesses whatsoever. At least this is how it works on baremetal. This hw poisoning emulation is just silly and unnecessary. But again, I probably am missing some aspects. It all just sounded really weird to me that's why I thought I should ask what's behind all that. Thx.
On Wed, May 15, 2024 at 1:19 PM Borislav Petkov <bp@alien8.de> wrote: > > On Wed, May 15, 2024 at 12:19:16PM -0700, Axel Rasmussen wrote: > > An unprivileged process can allocate a VMA, use the userfaultfd API to > > install one of these PTE markers, and then register a no-op SIGBUS > > handler. Now it can access that address in a tight loop, > > Maybe the userfaultfd should not allow this, I dunno. You made me look > at this thing and to me it all sounds weird. One thread does page fault > handling for the other and that helps with live migration somehow. OMG, > whaaat? > > Maybe I don't understand it and probably never will... > > But, for example, membarrier used do to a stupid thing of allowing one > thread to hammer another with an IPI storm. Bad bad idea. So it got > fixed. > > All I'm saying is, if unprivileged processes can do crap, they should be > prevented from doing crap. Like ratelimiting the pagefaults or whatnot. > > One of the recovery action strategies from memory poison is, well, you > kill the process. If you can detect the hammering process which > installed that page marker, you kill it. Problem solved. > > But again, this userfaultfd thing sounds really weird so I could very > well be way wrong. > > > Even in a non-contrived / non-malicious case, use of this API could > > have similar effects. If nothing else, the log message can be > > confusing to administrators: they state that an MCE occurred, whereas > > with the simulated poison API, this is not the case; it isn't a "real" > > MCE / hardware error. > > Yeah, I read that part in > > Documentation/admin-guide/mm/userfaultfd.rst > > Simulated poison huh? Another WTF. > > > In the KVM use case, the host can't just allocate a new page, because > > it doesn't know what the guest might have had stored there. Best we > > Ok, let's think of real hw poison. > > When doing the recovery, you don't care what's stored there because as > far as the hardware is concerned, if you consume that poison the *whole* > machine might go down. > > So you lose the page. Plain and simple. And the guest can go visit the > bureau of complaints and grievances. > > Still better than killing the guest or even the whole host with other > guests running on it. > > > can do is propagate the poison into the guest, and let the guest OS > > deal with it as it sees fit, and mark the page poisoned on the host. > > You mark the page as poison on the host and you yank it from under the > guest. That physical frame is gone and the faster all the actors > involved understand that, the better. > > > I don't disagree the guest *shouldn't* reaccess it in this case. :) > > But if it did, it should get another poison event just as you say. > > Yes, it shouldn't. Look at memory_failure(). This will kill whole > processes if it has to, depending on what the page is used for. > > > And, live migration between physical hosts should be transparent to > > the guest. So if the guest gets a poison, and then we live migrate it, > > So if I were to design this, I'd do it this way: > > 0. guest gets hw poison injected > > 1. it runs memory_failure() and it kills the processes using the page. > > 2. page is marked poisoned on the host so no other guest gets it. > > That's it. No second accesses whatsoever. At least this is how it works > on baremetal. I agree with almost all of the above. But one point is, I don't think we can trust the guest to be reasonable. :) Public cloud provider customers might run some OS other than Linux, or an old / buggy kernel, or one with out-of-tree patches which make it do who knows what. There can also be users who are actively malicious. Some customers may try to do fancy "poison recovery" where they can avoid killing the in-guest process when a poison event occurs. These implementations can be buggy :) and unintentionally reaccess. > > This hw poisoning emulation is just silly and unnecessary. > > But again, I probably am missing some aspects. It all just sounded > really weird to me that's why I thought I should ask what's behind all > that. > > Thx. > > -- > Regards/Gruss, > Boris. > > https://people.kernel.org/tglx/notes-about-netiquette
On Wed, May 15, 2024 at 12:21:51PM +0200, Oscar Salvador wrote: > On Tue, May 14, 2024 at 03:34:24PM -0600, Peter Xu wrote: > > The question is whether we can't. > > > > Now we reserved a swp entry just for hwpoison and it makes sense only > > because we cached the poisoned pfn inside. My long standing question is > > why do we ever need that pfn after all. If we don't need the pfn, we > > simply need a bit in the pgtable entry saying that it's poisoned, if > > accessed we should kill the process using sigbus. > > > > I used to comment on this before, the only path that uses that pfn is > > check_hwpoisoned_entry(), which was introduced in: > > > > commit a3f5d80ea401ac857f2910e28b15f35b2cf902f4 > > Author: Naoya Horiguchi <nao.horiguchi@gmail.com> > > Date: Mon Jun 28 19:43:14 2021 -0700 > > > > mm,hwpoison: send SIGBUS with error virutal address > > > > Now an action required MCE in already hwpoisoned address surely sends a > > SIGBUS to current process, but the SIGBUS doesn't convey error virtual > > address. That's not optimal for hwpoison-aware applications. > > > > To fix the issue, make memory_failure() call kill_accessing_process(), > > that does pagetable walk to find the error virtual address. It could find > > multiple virtual addresses for the same error page, and it seems hard to > > tell which virtual address is correct one. But that's rare and sending > > incorrect virtual address could be better than no address. So let's > > report the first found virtual address for now. > > > > So this time I read more on this and Naoya explained why - it's only used > > so far to dump the VA of the poisoned entry. > > Well, not really dumped, but we just pass the VA down the chain to the > signal handler. > > But on the question whether we need the pfn encoded, I am not sure > actually. > check_hwpoisoned_entry() checks whether the pfn where the pte sits is > the same as the hwpoisoned one, so we know if the process has to be > killed. > > Now, could we get away with using pte_page() instead of pte_pfn() in there, and > passing the hwpoisoned page instead ot the pfn? > I am trying to think of the implications, then we should not need the > encoded pfn? I sincerely don't know; we need help from someone know better on hwpoison, maybe. It's at least not needed in fault paths, afaiu. > > > However what confused me is, if an entry is poisoned already logically we > > dump that message in the fault handler not memory_failure(), which is: > > > > MCE: Killing uffd-unit-tests:650 due to hardware memory corruption fault at 7f3589d7e000 > > > > So perhaps we're trying to also dump that when the MCEs (points to the same > > pfn) are only generated concurrently? I donno much on hwpoison so I cannot > > tell, there's also implication where it's only triggered if > > MF_ACTION_REQUIRED. But I think it means hwpoison may work without pfn > > encoded, but I don't know the implication to lose that dmesg line. > > Not necesarily concurrently, but simply if for any reason the page could > not have been unmapped. > Let us say you have ProcessA and ProcessB mapping the same page. > We get an MCE for that page but we could not unmapped it, at least not > from all processes (maybe just ProcessA). > > memory-failure code will mark it as hwpoison, now ProcessA faults it in > and gets killed because we see that the page is hwpoisoned in the fault > path, so we sill send VM_FAULT_HWPOISON all the way down till you see > the: > > "MCE: Killing uffd-unit-tests:650 due to hardware memory corruption > fault at 7f3589d7e000" from e.g: arch/x86/mm/fault.c:do_sigbus() > > Now, ProcessB still has the page mapped, so upon re-accessing it, > it will trigger a new MCE event. memory-failure code will see that this The question is why accessing that hwpoison entry from ProcB triggers an MCE. Logically that's a swap entry and it should generate a page fault rather than MCE. Then in the pgfault hanlder we don't need that encoded pfn as we have vmf->address. > page has already been marked as hwpoisoned, but since we failed to > unmap it (otherwise no one should be re-accessing it), it goes: "ok, > let's just kill everybody who has this page mapped". > > > > We used to not dump error for swapin error. Note that here what I am > > saying is not that Axel is doing things wrong, but it's just that logically > > swapin error (as pte marker) can also be with !QUIET, so my final point is > > we may want to avoid having the assumption that "pte marker should always > > be QUITE", because I want to make it clear that pte marker can used in any > > form, so itself shouldn't imply anything.. > > I think it would make more sense if we have a separate marker for swapin > errors? > I mean, deep down, they do not mean the same as poison, right? > > Then you can choose which events get to be silent because you do not > care, and which ones need to scream loud. > I think swapin errors belong to the latter. At least I a heads-up why a > process is getting killed is appreciated, IMHO. Right it can be separate, and that was the plan IIRC. But maybe an overkill for now if nobody cared, and we can wait until someone really cares about that. After all adding a dmesg line for such event is much easier than removing one.. Thanks,
On Wed, May 15, 2024 at 10:18:31PM +0200, Borislav Petkov wrote: > So if I were to design this, I'd do it this way: > > 0. guest gets hw poison injected > > 1. it runs memory_failure() and it kills the processes using the page. > > 2. page is marked poisoned on the host so no other guest gets it. > > That's it. No second accesses whatsoever. At least this is how it works > on baremetal. > > This hw poisoning emulation is just silly and unnecessary. We (QEMU) haven't yet consumed this.. but I think it makes sense to have such emulation, as it's slightly different from a real hwpoison. I think the important bit that's missing in this picture is migration, that the VM can migrate from one host to another, carrying that poisoned PFN. Let's assume we have two hosts: src and dst. Currently VM runs on src host. Before migration, there is a real PFN that is bad, MCE injected. When accesssed by either guest vcpu or host cpu / hypervisor, VM gets killed. This is so far the same to any process that has a bad page. However it's possible a VM got migrated _before_ that bad PFN accessed, in this case the VM is still legal to run, the hypervisor will not migrate that bad PFN data knowing that its data is invalid. What it does is it'll tell dst that "this guest PFN is bad, if guest access it let's crash it". Then what dst host needs is a way to describe "this guest PFN is bad": the easiest way is to describe "this VA of the process is bad", meanwhile there'll be no real page backing that VA anyway, and also no real poisoned pages. We want to poison a VA only. That's why an emulation is needed. Besides that we want to get exactly whatever we'll get for a real hwpoison, e.g. SIGBUS with the address encoded, then KVM work naturally with that just like a real MCE. One other thing we can do is to inject-poison to the VA together with the page backing it, but that'll pollute a PFN on dst host to be a real bad PFN and won't be able to be used by the dst OS anymore, so it's less optimal. Thanks,
On Wed, May 22, 2024 at 05:46:09PM -0400, Peter Xu wrote: > > Now, ProcessB still has the page mapped, so upon re-accessing it, > > it will trigger a new MCE event. memory-failure code will see that this > > The question is why accessing that hwpoison entry from ProcB triggers an > MCE. Logically that's a swap entry and it should generate a page fault > rather than MCE. Then in the pgfault hanlder we don't need that encoded > pfn as we have vmf->address. It would be a swap entry if we reach try_to_umap_one() without trouble. Then we have the code that converts it: ... if (PageHWPoison(p)) pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); set_{huge_}pte_at ... But maybe we could only do that for ProcA, while ProcB failed to do that, which means that for ProcA that is a hwpoisoned-swap-entry, but ProcB still has this page mapped as usual, so if ProcB re-access it, that will not trigger a fault (because the page is still mapped in its pagetables).
On Thu, May 23, 2024 at 05:08:29AM +0200, Oscar Salvador wrote: > On Wed, May 22, 2024 at 05:46:09PM -0400, Peter Xu wrote: > > > Now, ProcessB still has the page mapped, so upon re-accessing it, > > > it will trigger a new MCE event. memory-failure code will see that this > > > > The question is why accessing that hwpoison entry from ProcB triggers an > > MCE. Logically that's a swap entry and it should generate a page fault > > rather than MCE. Then in the pgfault hanlder we don't need that encoded > > pfn as we have vmf->address. > > It would be a swap entry if we reach try_to_umap_one() without trouble. > Then we have the code that converts it: > > ... > if (PageHWPoison(p)) > pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); > set_{huge_}pte_at > ... > > But maybe we could only do that for ProcA, while ProcB failed to do that, > which means that for ProcA that is a hwpoisoned-swap-entry, but ProcB still > has this page mapped as usual, so if ProcB re-access it, that will not > trigger a fault (because the page is still mapped in its pagetables). But in that case "whether encode pfn in hwpoison swap entry" doesn't matter either.. as it's not yet converted to a swap entry, so the pfn is there. Thanks,
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index c39de84e98b0..6c5e8d6498bf 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -400,9 +400,12 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { unsigned int lsb = 0; - printk(KERN_ERR + + if (!(fault & VM_FAULT_HWPOISON_SILENT)) { + pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %08lx\n", - tsk->comm, tsk->pid, address); + tsk->comm, tsk->pid, address); + } /* * Either small page or large page may be poisoned. * In other words, VM_FAULT_HWPOISON_LARGE and diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 215690452495..c43bb6193a80 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -147,8 +147,10 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { unsigned int lsb = 0; /* shutup gcc */ - pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", - current->comm, current->pid, address); + if (!(fault & VM_FAULT_HWPOISON_SILENT)) { + pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + current->comm, current->pid, address); + } if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 67b18adc75dd..9ae5cc6bd933 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -964,9 +964,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk = current; unsigned lsb = 0; - pr_err( + if (!(fault & VM_FAULT_HWPOISON_SILENT)) { + pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", - tsk->comm, tsk->pid, address); + tsk->comm, tsk->pid, address); + } if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 24323c7d0bd4..7663a2725341 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1224,6 +1224,10 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded * in upper bits * @VM_FAULT_SIGSEGV: segmentation fault + * @VM_FAULT_HWPOISON_SILENT Hit a poisoned pte marker, which should not be + * logged to dmesg since it's something besides a + * real hardware memory error (swapin error, + * simulated poison via UFFDIO_POISON, etc.). * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page * @VM_FAULT_LOCKED: ->fault locked the returned page * @VM_FAULT_RETRY: ->fault blocked, must retry @@ -1237,20 +1241,21 @@ typedef __bitwise unsigned int vm_fault_t; * */ enum vm_fault_reason { - VM_FAULT_OOM = (__force vm_fault_t)0x000001, - VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, - VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, - VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, - VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, - VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, - VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, - VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, - VM_FAULT_RETRY = (__force vm_fault_t)0x000400, - VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, - VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, - VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, - VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, - VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, + VM_FAULT_OOM = (__force vm_fault_t)0x000001, + VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, + VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, + VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, + VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, + VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, + VM_FAULT_HWPOISON_SILENT = (__force vm_fault_t)0x000080, + VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, + VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, + VM_FAULT_RETRY = (__force vm_fault_t)0x000400, + VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, + VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, + VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, + VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, + VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; /* Encode hstate index for a hwpoisoned large page */ @@ -1268,6 +1273,7 @@ enum vm_fault_reason { { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ + { VM_FAULT_HWPOISON_SILENT, "HWPOISON_SILENT" }, \ { VM_FAULT_NOPAGE, "NOPAGE" }, \ { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6be78e7d4f6e..91517cd7f44c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6485,7 +6485,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { - ret = VM_FAULT_HWPOISON_LARGE | + ret = VM_FAULT_HWPOISON_SILENT | + VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } diff --git a/mm/memory.c b/mm/memory.c index eea6e4984eae..721c0731cef2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3938,7 +3938,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) /* Higher priority than uffd-wp when data corrupted */ if (marker & PTE_MARKER_POISONED) - return VM_FAULT_HWPOISON; + return VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_SILENT; if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf);