
[3/3] KVM: gmem: track preparedness a page at a time

Message ID: 20241108155056.332412-4-pbonzini@redhat.com
State: New
Series: KVM: gmem: track preparedness a page at a time

Commit Message

Paolo Bonzini Nov. 8, 2024, 3:50 p.m. UTC
With support for large pages in gmem, it may happen that part of the gmem
is mapped with large pages and part with 4k pages.  For example, if a
conversion happens on a small region within a large page, the large
page has to be smashed into small pages even if backed by a large folio.
Each of the small pages will have its own state of preparedness,
which makes it harder to use the uptodate flag for preparedness.

Just switch to a bitmap in the inode's i_private data.  This is
a bit gnarly because ordinary bitmap operations in Linux are not
atomic, but otherwise not too hard.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/guest_memfd.c | 103 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)

Comments

Sean Christopherson Nov. 14, 2024, 1:31 a.m. UTC | #1
On Fri, Nov 08, 2024, Paolo Bonzini wrote:
> With support for large pages in gmem, it may happen that part of the gmem
> is mapped with large pages and part with 4k pages.  For example, if a
> conversion happens on a small region within a large page, the large
> page has to be smashed into small pages even if backed by a large folio.
> Each of the small pages will have its own state of preparedness,
> which makes it harder to use the uptodate flag for preparedness.
> 
> Just switch to a bitmap in the inode's i_private data.  This is
> a bit gnarly because ordinary bitmap operations in Linux are not
> atomic, but otherwise not too hard.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  virt/kvm/guest_memfd.c | 103 +++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 100 insertions(+), 3 deletions(-)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 416e02a00cae..e08503dfdd8a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -68,8 +68,13 @@ static struct file *kvm_gmem_create_file(const char *name, const struct file_ope
>  }
>  
>  
> +#define KVM_GMEM_INODE_SIZE(size)			\
> +	struct_size_t(struct kvm_gmem_inode, prepared,	\
> +		      DIV_ROUND_UP(size, PAGE_SIZE * BITS_PER_LONG))
> +
>  struct kvm_gmem_inode {
>  	unsigned long flags;
> +	unsigned long prepared[];
>  };
>  
>  struct kvm_gmem {
> @@ -107,18 +112,110 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
>  	return 0;
>  }
>  
> +/*
> + * The bitmap of prepared pages has to be accessed atomically, because
> + * preparation is not protected by any guest.

Not entirely sure what you intended to say here.  KVM should never rely on the
guest to provide protection for anything :-)

> +   This unfortunately means that we cannot use regular bitmap operations.

No "we" please.

> + * The logic becomes a bit simpler for set and test, which operate a
> + * folio at a time and therefore can assume that the range is naturally
> + * aligned (meaning that either it is smaller than a word, or it does
> + * not include fractions of a word).  For punch-hole operations however
> + * there is all the complexity.
> + */

This entire comment feels like it belongs in the changelog.

> +
> +static void bitmap_set_atomic_word(unsigned long *p, unsigned long start, unsigned long len)

I vote to keep wrapping at ~80 chars by default.  I have no objection to running
slightly past 80 when the result is a net positive, and in some cases prefer to do
so, but as a general rule I like wrapping at ~80 better than wrapping at ~100.

> +{
> +	unsigned long mask_to_set =
> +		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);

Align the right hand side?  E.g.

	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start) &
				    BITMAP_LAST_WORD_MASK(start + len);

Though I think it makes sense to add a macro to generate the mask, maybe with an
assert that the inputs are sane?  E.g. something like BITMAP_WORD_MASK().
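
E.g. a rough sketch (the name follows the suggestion above; the WARN_ON_ONCE()
check is just one guess at what a sanity assert could look like, not an
existing kernel macro):

	#define BITMAP_WORD_MASK(start, len)					\
	({									\
		unsigned long __s = (start) & (BITS_PER_LONG - 1);		\
		unsigned long __l = (len);					\
										\
		/* The range must not be empty or cross a word boundary. */	\
		WARN_ON_ONCE(!__l || __s + __l > BITS_PER_LONG);		\
		BITMAP_FIRST_WORD_MASK(__s) & BITMAP_LAST_WORD_MASK(__s + __l);\
	})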

> +
> +	atomic_long_or(mask_to_set, (atomic_long_t *)p);
> +}
> +
> +static void bitmap_clear_atomic_word(unsigned long *p, unsigned long start, unsigned long len)

If these are going to stay in KVM, I think it would be better to force the caller
to cast to atomic_long_t.  Actually, I think kvm_gmem_inode.prepared itself should
be an array of atomic_long_t.  More below.
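
Concretely, something along these lines (a sketch of the suggestion, not the
posted patch):

	struct kvm_gmem_inode {
		unsigned long flags;
		atomic_long_t prepared[];
	};

	static void bitmap_set_atomic_word(atomic_long_t *p, unsigned long start,
					   unsigned long len)
	{
		unsigned long mask = BITMAP_FIRST_WORD_MASK(start) &
				     BITMAP_LAST_WORD_MASK(start + len);

		atomic_long_or(mask, p);
	}

That way the callers deal in atomic_long_t from the start and the casts go away.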

> +{
> +	unsigned long mask_to_set =

mask_to_clear, though if we add a macro, this could probably be something like:

	atomic_long_andnot(BITMAP_WORD_MASK(start, len), p);

> +		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
> +
> +	atomic_long_andnot(mask_to_set, (atomic_long_t *)p);
> +}
> +
> +static bool bitmap_test_allset_word(unsigned long *p, unsigned long start, unsigned long len)
> +{
> +	unsigned long mask_to_set =

mask_to_test, though using a macro would make the local variables go away.  At
that point, it might even make sense to forego the helpers entirely.

> +		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
> +
> +	return (*p & mask_to_set) == mask_to_set;

Dereferencing *p should be READ_ONCE(), no?  And if everything is tracked as
atomic_long_t, then KVM is forced to do the right thing via atomic_long_read().
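
E.g. (again assuming the atomic_long_t array from above):

	static bool bitmap_test_allset_word(atomic_long_t *p, unsigned long start,
					    unsigned long len)
	{
		unsigned long mask = BITMAP_FIRST_WORD_MASK(start) &
				     BITMAP_LAST_WORD_MASK(start + len);

		return (atomic_long_read(p) & mask) == mask;
	}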

> +}
> +
>  static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
>  {
> -	folio_mark_uptodate(folio);
> +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
> +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> +	unsigned long npages = folio_nr_pages(folio);
> +
> +	/* Folios must be naturally aligned */
> +	WARN_ON_ONCE(index & (npages - 1));
> +	index &= ~(npages - 1);
> +
> +	/* Clear page before updating bitmap.  */
> +	smp_wmb();
> +
> +	if (npages < BITS_PER_LONG) {
> +		bitmap_set_atomic_word(p, index, npages);
> +	} else {
> +		BUILD_BUG_ON(BITS_PER_LONG != 64);
> +		memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));

More code that could be deduplicated (unprepared path below).

But more importantly, I'm pretty sure the entire lockless approach is unsafe.

Callers of kvm_gmem_get_pfn() do not take any locks that protect kvm_gmem_mark_prepared()
from racing with kvm_gmem_mark_range_unprepared().  kvm_mmu_invalidate_begin()
prevents KVM from installing a stage-2 mapping, i.e. from consuming the PFN, but
it doesn't prevent kvm_gmem_mark_prepared() from being called.  And
sev_handle_rmp_fault() never checks mmu_notifiers, which is probably fine?  But
sketchy.

Oof.  Side topic.  sev_handle_rmp_fault() is suspect for other reasons.  It does
its own lookup of the PFN, and so could see an entirely different PFN than was
resolved by kvm_mmu_page_fault().  And thanks to KVM's wonderful 0/1/-errno
behavior, sev_handle_rmp_fault() is invoked even when KVM wants to retry the
fault, e.g. due to an active MMU invalidation.

Anyways, KVM wouldn't _immediately_ consume bad data, as the invalidation
would block the current page fault.  But clobbering i_gmem->prepared would result
in a missed "prepare" phase if the hole-punch region were restored.

One idea would be to use a rwlock to protect updates to the bitmap (setters can
generally stomp on each other).  And to avoid contention whenever possible in
page fault paths, only take the lock if the page is not up-to-date, because again
kvm_mmu_invalidate_{begin,end}() protects against UAF, it's purely updates to the
bitmap that need extra protection.
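
Roughly (one possible reading of the idea; the lock name, its placement in
struct kvm_gmem_inode, and initializing it with rwlock_init() at inode
creation are all assumptions):

	static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index,
					   struct folio *folio)
	{
		struct kvm_gmem_inode *i_gmem = file->f_inode->i_private;

		/* Setters may stomp on each other, but not on hole punch. */
		read_lock(&i_gmem->prepared_lock);
		/* ... set the bits for this folio ... */
		read_unlock(&i_gmem->prepared_lock);
	}

	static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index,
						   pgoff_t npages)
	{
		struct kvm_gmem_inode *i_gmem = inode->i_private;

		/* Hole punch excludes all other bitmap updates. */
		write_lock(&i_gmem->prepared_lock);
		/* ... clear the bits for the range ... */
		write_unlock(&i_gmem->prepared_lock);
	}

kvm_gmem_is_prepared() would stay lockless, so the fault path only takes the
lock when the folio isn't already prepared.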

Note, using mmu_invalidate_retry_gfn() is unsafe, because it must be called
under mmu_lock to ensure it doesn't get false negatives.

> +	}
>  }
>  
>  static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, pgoff_t npages)
>  {
> +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)inode->i_private;
> +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> +
> +	index &= BITS_PER_LONG - 1;
> +	if (index) {
> +		int first_word_count = min(npages, BITS_PER_LONG - index);

Newline.

> +		bitmap_clear_atomic_word(p, index, first_word_count);
> +		npages -= first_word_count;
> +		p++;
> +	}
> +
> +	if (npages > BITS_PER_LONG) {
> +		BUILD_BUG_ON(BITS_PER_LONG != 64);
> +		memset64((u64 *)p, 0, BITS_TO_LONGS(npages));
> +		p += BIT_WORD(npages);
> +		npages &= BITS_PER_LONG - 1;
> +	}
> +
> +	if (npages)
> +		bitmap_clear_atomic_word(p++, 0, npages);
>  }
>  
>  static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
>  {
> -	return folio_test_uptodate(folio);
> +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
> +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> +	unsigned long npages = folio_nr_pages(folio);
> +	bool ret;
> +
> +	/* Folios must be naturally aligned */
> +	WARN_ON_ONCE(index & (npages - 1));
> +	index &= ~(npages - 1);
> +
> +	if (npages < BITS_PER_LONG) {
> +		ret = bitmap_test_allset_word(p, index, npages);
> +	} else {
> +		for (; npages > 0; npages -= BITS_PER_LONG)

for-loop needs curly braces.

> +			if (*p++ != ~0)

Presumably a READ_ONCE() here as well.

> +				break;
> +		ret = (npages == 0);
> +	}
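
I.e., folding in the braces and READ_ONCE() above, something like:

		for (; npages > 0; npages -= BITS_PER_LONG) {
			if (READ_ONCE(*p++) != ~0UL)
				break;
		}

(or atomic_long_read() if the bitmap becomes an array of atomic_long_t).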
> +
> +	/* Synchronize with kvm_gmem_mark_prepared().  */
> +	smp_rmb();
> +	return ret;
>  }
>  
>  /*
> @@ -499,7 +596,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  	struct file *file;
>  	int fd, err;
>  
> -	i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
> +	i_gmem = kvzalloc(KVM_GMEM_INODE_SIZE(size), GFP_KERNEL);
>  	if (!i_gmem)
>  		return -ENOMEM;
>  	i_gmem->flags = flags;
> -- 
> 2.43.5
>
Sean Christopherson Nov. 14, 2024, 1:41 a.m. UTC | #2
On Wed, Nov 13, 2024, Sean Christopherson wrote:
> On Fri, Nov 08, 2024, Paolo Bonzini wrote:
> >  static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
> >  {
> > -	folio_mark_uptodate(folio);
> > +	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
> > +	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
> > +	unsigned long npages = folio_nr_pages(folio);
> > +
> > +	/* Folios must be naturally aligned */
> > +	WARN_ON_ONCE(index & (npages - 1));
> > +	index &= ~(npages - 1);
> > +
> > +	/* Clear page before updating bitmap.  */
> > +	smp_wmb();
> > +
> > +	if (npages < BITS_PER_LONG) {
> > +		bitmap_set_atomic_word(p, index, npages);
> > +	} else {
> > +		BUILD_BUG_ON(BITS_PER_LONG != 64);
> > +		memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));
> 
> More code that could be deduplicated (unprepared path below).
> 
> But more importantly, I'm pretty sure the entire lockless approach is unsafe.
> 
> Callers of kvm_gmem_get_pfn() do not take any locks that protect kvm_gmem_mark_prepared()
> from racing with kvm_gmem_mark_range_unprepared().  kvm_mmu_invalidate_begin()
> prevents KVM from installing a stage-2 mapping, i.e. from consuming the PFN, but
> it doesn't prevent kvm_gmem_mark_prepared() from being called.  And
> sev_handle_rmp_fault() never checks mmu_notifiers, which is probably fine?  But
> sketchy.
> 
> Oof.  Side topic.  sev_handle_rmp_fault() is suspect for other reasons.  It does
> its own lookup of the PFN, and so could see an entirely different PFN than was
> resolved by kvm_mmu_page_fault().  And thanks to KVM's wonderful 0/1/-errno
> behavior, sev_handle_rmp_fault() is invoked even when KVM wants to retry the
> fault, e.g. due to an active MMU invalidation.
> 
> Anyways, KVM wouldn't _immediately_ consume bad data, as the invalidation
> would block the current page fault.  But clobbering i_gmem->prepared would result
> in a missed "prepare" phase if the hole-punch region were restored.
> 
> One idea would be to use a rwlock to protect updates to the bitmap (setters can
> generally stomp on each other).  And to avoid contention whenever possible in
> page fault paths, only take the lock if the page is not up-to-date, because again
> kvm_mmu_invalidate_{begin,end}() protects against UAF, it's purely updates to the
> bitmap that need extra protection.

Actually, there's no point in having a rwlock, because readers are serialized on
the folio's lock.  And KVM absolutely relies on that already, because otherwise
multiple vCPUs could see an unprepared folio, and clear_highpage() could end up
racing with writes from the vCPU.

> Note, using mmu_invalidate_retry_gfn() is unsafe, because it must be called
> under mmu_lock to ensure it doesn't get false negatives.

Patch

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 416e02a00cae..e08503dfdd8a 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -68,8 +68,13 @@  static struct file *kvm_gmem_create_file(const char *name, const struct file_ope
 }
 
 
+#define KVM_GMEM_INODE_SIZE(size)			\
+	struct_size_t(struct kvm_gmem_inode, prepared,	\
+		      DIV_ROUND_UP(size, PAGE_SIZE * BITS_PER_LONG))
+
 struct kvm_gmem_inode {
 	unsigned long flags;
+	unsigned long prepared[];
 };
 
 struct kvm_gmem {
@@ -107,18 +112,110 @@  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
 	return 0;
 }
 
+/*
+ * The bitmap of prepared pages has to be accessed atomically, because
+ * preparation is not protected by any guest.  This unfortunately means
+ * that we cannot use regular bitmap operations.
+ *
+ * The logic becomes a bit simpler for set and test, which operate a
+ * folio at a time and therefore can assume that the range is naturally
+ * aligned (meaning that either it is smaller than a word, or it does
+ * not include fractions of a word).  For punch-hole operations however
+ * there is all the complexity.
+ */
+
+static void bitmap_set_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+	unsigned long mask_to_set =
+		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+	atomic_long_or(mask_to_set, (atomic_long_t *)p);
+}
+
+static void bitmap_clear_atomic_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+	unsigned long mask_to_set =
+		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+	atomic_long_andnot(mask_to_set, (atomic_long_t *)p);
+}
+
+static bool bitmap_test_allset_word(unsigned long *p, unsigned long start, unsigned long len)
+{
+	unsigned long mask_to_set =
+		BITMAP_FIRST_WORD_MASK(start) & BITMAP_LAST_WORD_MASK(start + len);
+
+	return (*p & mask_to_set) == mask_to_set;
+}
+
 static void kvm_gmem_mark_prepared(struct file *file, pgoff_t index, struct folio *folio)
 {
-	folio_mark_uptodate(folio);
+	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+	unsigned long npages = folio_nr_pages(folio);
+
+	/* Folios must be naturally aligned */
+	WARN_ON_ONCE(index & (npages - 1));
+	index &= ~(npages - 1);
+
+	/* Clear page before updating bitmap.  */
+	smp_wmb();
+
+	if (npages < BITS_PER_LONG) {
+		bitmap_set_atomic_word(p, index, npages);
+	} else {
+		BUILD_BUG_ON(BITS_PER_LONG != 64);
+		memset64((u64 *)p, ~0, BITS_TO_LONGS(npages));
+	}
 }
 
 static void kvm_gmem_mark_range_unprepared(struct inode *inode, pgoff_t index, pgoff_t npages)
 {
+	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)inode->i_private;
+	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+
+	index &= BITS_PER_LONG - 1;
+	if (index) {
+		int first_word_count = min(npages, BITS_PER_LONG - index);
+		bitmap_clear_atomic_word(p, index, first_word_count);
+		npages -= first_word_count;
+		p++;
+	}
+
+	if (npages > BITS_PER_LONG) {
+		BUILD_BUG_ON(BITS_PER_LONG != 64);
+		memset64((u64 *)p, 0, BITS_TO_LONGS(npages));
+		p += BIT_WORD(npages);
+		npages &= BITS_PER_LONG - 1;
+	}
+
+	if (npages)
+		bitmap_clear_atomic_word(p++, 0, npages);
 }
 
 static bool kvm_gmem_is_prepared(struct file *file, pgoff_t index, struct folio *folio)
 {
-	return folio_test_uptodate(folio);
+	struct kvm_gmem_inode *i_gmem = (struct kvm_gmem_inode *)file->f_inode->i_private;
+	unsigned long *p = i_gmem->prepared + BIT_WORD(index);
+	unsigned long npages = folio_nr_pages(folio);
+	bool ret;
+
+	/* Folios must be naturally aligned */
+	WARN_ON_ONCE(index & (npages - 1));
+	index &= ~(npages - 1);
+
+	if (npages < BITS_PER_LONG) {
+		ret = bitmap_test_allset_word(p, index, npages);
+	} else {
+		for (; npages > 0; npages -= BITS_PER_LONG)
+			if (*p++ != ~0)
+				break;
+		ret = (npages == 0);
+	}
+
+	/* Synchronize with kvm_gmem_mark_prepared().  */
+	smp_rmb();
+	return ret;
 }
 
 /*
@@ -499,7 +596,7 @@  static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	struct file *file;
 	int fd, err;
 
-	i_gmem = kvzalloc(sizeof(struct kvm_gmem_inode), GFP_KERNEL);
+	i_gmem = kvzalloc(KVM_GMEM_INODE_SIZE(size), GFP_KERNEL);
 	if (!i_gmem)
 		return -ENOMEM;
 	i_gmem->flags = flags;