Message ID | 20240726235234.228822-85-seanjc@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: Stop grabbing references to PFNMAP'd pages | expand |
On 7/27/24 01:52, Sean Christopherson wrote: > Now that KVM no longer relies on an ugly heuristic to find its struct page > references, i.e. now that KVM can't get false positives on VM_MIXEDMAP > pfns, remove KVM's hack to elevate the refcount for pfns that happen to > have a valid struct page. In addition to removing a long-standing wart > in KVM, this allows KVM to map non-refcounted struct page memory into the > guest, e.g. for exposing GPU TTM buffers to KVM guests. Feel free to leave it to me for later, but there are more cleanups that can be made, given how simple kvm_resolve_pfn() is now: > @@ -2814,35 +2768,10 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, > if (kfp->map_writable) > *kfp->map_writable = writable; > > if (pte) > pfn = pte_pfn(*pte); > else > pfn = page_to_pfn(page); > > *kfp->refcounted_page = page; > Something like (untested/uncompiled): --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2758,32 +2758,12 @@ static inline int check_user_page_hwpois return rc == -EHWPOISON; } -static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, - pte_t *pte, bool writable) -{ - kvm_pfn_t pfn; - - WARN_ON_ONCE(!!page == !!pte); - - if (kfp->map_writable) - *kfp->map_writable = writable; - - if (pte) - pfn = pte_pfn(*pte); - else - pfn = page_to_pfn(page); - - *kfp->refcounted_page = page; - - return pfn; -} - /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. It's also the * only part that runs if we can in atomic context. 
*/ -static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) +static bool hva_to_page_fast(struct kvm_follow_pfn *kfp) { struct page *page; bool r; @@ -2799,23 +2779,21 @@ static bool hva_to_pfn_fast(struct kvm_f return false; if (kfp->pin) - r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; + r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, kfp->refcounted_page) == 1; else - r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); + r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, kfp->refcounted_page); - if (r) { - *pfn = kvm_resolve_pfn(kfp, page, NULL, true); - return true; - } + if (r) + kfp->flags |= FOLL_WRITE; - return false; + return r; } /* * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) +static int hva_to_page(struct kvm_follow_pfn *kfp) { /* * When a VCPU accesses a page that is not mapped into the secondary @@ -2829,34 +2807,32 @@ static int hva_to_pfn_slow(struct kvm_fo * implicitly honor NUMA hinting faults and don't need this flag. */ unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags; - struct page *page, *wpage; + struct page *wpage; int npages; + if (hva_to_page_fast(kfp)) + return 1; + if (kfp->pin) - npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); + npages = pin_user_pages_unlocked(kfp->hva, 1, kfp->refcounted_page, flags); else - npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); - if (npages != 1) - return npages; + npages = get_user_pages_unlocked(kfp->hva, 1, kfp->refcounted_page, flags); /* - * Pinning is mutually exclusive with opportunistically mapping a read - * fault as writable, as KVM should never pin pages when mapping memory - * into the guest (pinning is only for direct accesses from KVM). 
+ * Map read fault as writable if possible; pinning is mutually exclusive + * with opportunistically mapping a read fault as writable, as KVM should + * should never pin pages when mapping memory into the guest (pinning is + * only for direct accesses from KVM). */ - if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) - goto out; - - /* map read fault as writable if possible */ - if (!(flags & FOLL_WRITE) && kfp->map_writable && + if (npages == 1 && + kfp->map_writable && !WARN_ON_ONCE(kfp->pin) && + !(flags & FOLL_WRITE) && get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { - put_page(page); - page = wpage; - flags |= FOLL_WRITE; + put_page(kfp->refcounted_page); + kfp->refcounted_page = wpage; + kfp->flags |= FOLL_WRITE; } -out: - *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -2915,7 +2891,9 @@ static int hva_to_pfn_remapped(struct vm goto out; } - *p_pfn = kvm_resolve_pfn(kfp, NULL, &pte, pte_write(pte)); + if (kfp->map_writable) + *kfp->map_writable = pte_write(pte); + *p_pfn = pte_pfn(pte); out: pte_unmap_unlock(ptep, ptl); return r; @@ -2932,12 +2910,13 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_p if (WARN_ON_ONCE(!kfp->refcounted_page)) return KVM_PFN_ERR_FAULT; - if (hva_to_pfn_fast(kfp, &pfn)) - return pfn; + npages = hva_to_page(kfp); + if (npages == 1) { + if (kfp->map_writable) + *kfp->map_writable = kfp->flags & FOLL_WRITE; + return page_to_pfn(kfp->refcounted_page); + } - npages = hva_to_pfn_slow(kfp, &pfn); - if (npages == 1) - return pfn; if (npages == -EINTR) return KVM_PFN_ERR_SIGPENDING; Also, check_user_page_hwpoison() should not be needed anymore, probably not since commit 234b239bea39 ("kvm: Faults which trigger IO release the mmap_sem", 2014-09-24) removed get_user_pages_fast() from hva_to_pfn_slow(). The only way that you could get a poisoned page without returning -EHWPOISON, is if FOLL_HWPOISON was not passed. 
But even without these patches, the cases are: - npages == 0, then you must have FOLL_NOWAIT and you'd not use check_user_page_hwpoison() - npages == 1 or npages == -EHWPOISON, all good - npages == -EAGAIN from mmap_read_lock_killable() - should handle that like -EINTR - everything else including -EFAULT can go down the vma_lookup() path, because npages < 0 means we went through hva_to_pfn_slow() which uses FOLL_HWPOISON This means that you can simply have if (npages == -EHWPOISON) return KVM_PFN_ERR_HWPOISON; before the mmap_read_lock() line. You may either sneak this at the beginning of the series or leave it for later. Paolo
On Tue, Jul 30, 2024, Paolo Bonzini wrote: > On 7/27/24 01:52, Sean Christopherson wrote: > > Now that KVM no longer relies on an ugly heuristic to find its struct page > > references, i.e. now that KVM can't get false positives on VM_MIXEDMAP > > pfns, remove KVM's hack to elevate the refcount for pfns that happen to > > have a valid struct page. In addition to removing a long-standing wart > > in KVM, this allows KVM to map non-refcounted struct page memory into the > > guest, e.g. for exposing GPU TTM buffers to KVM guests. > > Feel free to leave it to me for later, but there are more cleanups that > can be made, given how simple kvm_resolve_pfn() is now: I'll revisit kvm_resolve_pfn(), Maxim also wasn't a fan of a similar helper that existed in v11. > Also, check_user_page_hwpoison() should not be needed anymore, probably > not since commit 234b239bea39 ("kvm: Faults which trigger IO release the > mmap_sem", 2014-09-24) removed get_user_pages_fast() from hva_to_pfn_slow(). Ha, I *knew* this sounded familiar. Past me apparently came to the same conclusion[*], though I wrongly suspected a memory leak and promptly forgot to ever send a patch. I'll tack one on this time around. [*] https://lore.kernel.org/all/ZGKC9fHoE+kDs0ar@google.com > The only way that you could get a poisoned page without returning -EHWPOISON, > is if FOLL_HWPOISON was not passed. But even without these patches, > the cases are: > - npages == 0, then you must have FOLL_NOWAIT and you'd not use > check_user_page_hwpoison() > - npages == 1 or npages == -EHWPOISON, all good > - npages == -EAGAIN from mmap_read_lock_killable() - should handle that like -EINTR > - everything else including -EFAULT can go downt the vma_lookup() path, because > npages < 0 means we went through hva_to_pfn_slow() which uses FOLL_HWPOISON > > This means that you can simply have > > if (npages == -EHWPOISON) > return KVM_PFN_ERR_HWPOISON; > > before the mmap_read_lock() line. 
You may either sneak this at the beginning > of the series or leave it for later. > > Paolo >
On 7/30/24 22:21, Sean Christopherson wrote: > On Tue, Jul 30, 2024, Paolo Bonzini wrote: >> On 7/27/24 01:52, Sean Christopherson wrote: >>> Now that KVM no longer relies on an ugly heuristic to find its struct page >>> references, i.e. now that KVM can't get false positives on VM_MIXEDMAP >>> pfns, remove KVM's hack to elevate the refcount for pfns that happen to >>> have a valid struct page. In addition to removing a long-standing wart >>> in KVM, this allows KVM to map non-refcounted struct page memory into the >>> guest, e.g. for exposing GPU TTM buffers to KVM guests. >> >> Feel free to leave it to me for later, but there are more cleanups that >> can be made, given how simple kvm_resolve_pfn() is now: > > I'll revisit kvm_resolve_pfn(), Maxim also wasn't a fan of a similar helper that > existed in v11. FWIW kvm_resolve_pfn() is totally fine as an intermediate step. Just food for thought for possible follow-ups. >> Also, check_user_page_hwpoison() should not be needed anymore, probably >> not since commit 234b239bea39 ("kvm: Faults which trigger IO release the >> mmap_sem", 2014-09-24) removed get_user_pages_fast() from hva_to_pfn_slow(). > > Ha, I *knew* this sounded familiar. Past me apparently came to the same > conclusion[*], though I wrongly suspected a memory leak and promptly forgot to > ever send a patch. I'll tack one on this time around. As you prefer. Paolo
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87d61f16a449..d4513ffaf2e1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1702,9 +1702,6 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn); -bool kvm_is_zone_device_page(struct page *page); - struct kvm_irq_ack_notifier { struct hlist_node link; unsigned gsi; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8b85e1130a63..e279140f2425 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -160,52 +160,6 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } -bool kvm_is_zone_device_page(struct page *page) -{ - /* - * The metadata used by is_zone_device_page() to determine whether or - * not a page is ZONE_DEVICE is guaranteed to be valid if and only if - * the device has been pinned, e.g. by get_user_pages(). WARN if the - * page_count() is zero to help detect bad usage of this helper. - */ - if (WARN_ON_ONCE(!page_count(page))) - return false; - - return is_zone_device_page(page); -} - -/* - * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted - * page, NULL otherwise. Note, the list of refcounted PG_reserved page types - * is likely incomplete, it has been compiled purely through people wanting to - * back guest with a certain type of memory and encountering issues. - */ -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - if (!PageReserved(page)) - return page; - - /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ - if (is_zero_pfn(pfn)) - return page; - - /* - * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting - * perspective they are "normal" pages, albeit with slightly different - * usage rules. 
- */ - if (kvm_is_zone_device_page(page)) - return page; - - return NULL; -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -2814,35 +2768,10 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, if (kfp->map_writable) *kfp->map_writable = writable; - /* - * FIXME: Remove this once KVM no longer blindly calls put_page() on - * every pfn that points at a struct page. - * - * Get a reference for follow_pte() pfns if they happen to point at a - * struct page, as KVM will ultimately call kvm_release_pfn_clean() on - * the returned pfn, i.e. KVM expects to have a reference. - * - * Certain IO or PFNMAP mappings can be backed with valid struct pages, - * but be allocated without refcounting, e.g. tail pages of - * non-compound higher order allocations. Grabbing and putting a - * reference to such pages would cause KVM to prematurely free a page - * it doesn't own (KVM gets and puts the one and only reference). - * Don't allow those pages until the FIXME is resolved. - * - * Don't grab a reference for pins, callers that pin pages are required - * to check refcounted_page, i.e. must not blindly release the pfn. - */ - if (pte) { + if (pte) pfn = pte_pfn(*pte); - - if (!kfp->pin) { - page = kvm_pfn_to_refcounted_page(pfn); - if (page && !get_page_unless_zero(page)) - return KVM_PFN_ERR_FAULT; - } - } else { + else pfn = page_to_pfn(page); - } *kfp->refcounted_page = page;
Now that KVM no longer relies on an ugly heuristic to find its struct page references, i.e. now that KVM can't get false positives on VM_MIXEDMAP pfns, remove KVM's hack to elevate the refcount for pfns that happen to have a valid struct page. In addition to removing a long-standing wart in KVM, this allows KVM to map non-refcounted struct page memory into the guest, e.g. for exposing GPU TTM buffers to KVM guests. Signed-off-by: Sean Christopherson <seanjc@google.com> --- include/linux/kvm_host.h | 3 -- virt/kvm/kvm_main.c | 75 ++-------------------------------------- 2 files changed, 2 insertions(+), 76 deletions(-)