@@ -280,7 +280,7 @@ struct kvm_host_map {
* can be used as guest memory but they are not managed by host
* kernel).
*/
- struct page *refcounted_page;
+ struct page *pinned_page;
struct page *page;
void *hva;
kvm_pfn_t pfn;
@@ -2814,9 +2814,12 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
*/
if (map) {
pfn = map->pfn;
- page = kvm_pfn_to_refcounted_page(pfn);
- if (page && !get_page_unless_zero(page))
- return KVM_PFN_ERR_FAULT;
+
+ if (!kfp->pin) {
+ page = kvm_pfn_to_refcounted_page(pfn);
+ if (page && !get_page_unless_zero(page))
+ return KVM_PFN_ERR_FAULT;
+ }
} else {
pfn = page_to_pfn(page);
}
@@ -2834,16 +2837,24 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
{
struct page *page;
+ bool r;
/*
- * Fast pin a writable pfn only if it is a write fault request
- * or the caller allows to map a writable pfn for a read fault
- * request.
+ * Try the fast-only path when the caller wants to pin/get the page for
+ * writing. If the caller only wants to read the page, KVM must go
+ * down the full, slow path in order to avoid racing an operation that
+ * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
+ * at the old, read-only page while mm/ points at a new, writable page.
*/
if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable))
return false;
- if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) {
+ if (kfp->pin)
+ r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
+ else
+ r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);
+
+ if (r) {
*pfn = kvm_resolve_pfn(kfp, page, NULL, true);
return true;
}
@@ -2872,10 +2883,21 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
struct page *page, *wpage;
int npages;
- npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
+ if (kfp->pin)
+ npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
+ else
+ npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
if (npages != 1)
return npages;
+ /*
+ * Pinning is mutually exclusive with opportunistically mapping a read
+ * fault as writable, as KVM should never pin pages when mapping memory
+ * into the guest (pinning is only for direct accesses from KVM).
+ */
+ if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
+ goto out;
+
/* map read fault as writable if possible */
if (!(flags & FOLL_WRITE) && kfp->map_writable &&
get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
@@ -2884,6 +2906,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
flags |= FOLL_WRITE;
}
+out:
*pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
return npages;
}
@@ -3099,10 +3122,11 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
.slot = gfn_to_memslot(vcpu->kvm, gfn),
.gfn = gfn,
.flags = FOLL_WRITE,
- .refcounted_page = &map->refcounted_page,
+ .refcounted_page = &map->pinned_page,
+ .pin = true,
};
- map->refcounted_page = NULL;
+ map->pinned_page = NULL;
map->page = NULL;
map->hva = NULL;
map->gfn = gfn;
@@ -3139,16 +3163,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
if (dirty)
kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
- if (map->refcounted_page) {
+ if (map->pinned_page) {
if (dirty)
- kvm_release_page_dirty(map->refcounted_page);
- else
- kvm_release_page_clean(map->refcounted_page);
+ kvm_set_page_dirty(map->pinned_page);
+ kvm_set_page_accessed(map->pinned_page);
+ unpin_user_page(map->pinned_page);
}
map->hva = NULL;
map->page = NULL;
- map->refcounted_page = NULL;
+ map->pinned_page = NULL;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
@@ -30,6 +30,13 @@ struct kvm_follow_pfn {
/* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */
unsigned int flags;
+ /*
+ * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag).
+ * The page *must* be pinned if KVM will write to the page via a kernel
+ * mapping, e.g. via kmap(), mremap(), etc.
+ */
+ bool pin;
+
/*
* If non-NULL, try to get a writable mapping even for a read fault.
* Set to true if a writable mapping was obtained.