
[RFC,v8,04/56] KVM: Add HVA range operator

Message ID 20230220183847.59159-5-michael.roth@amd.com (mailing list archive)
State Not Applicable
Delegated to: Herbert Xu
Series Add AMD Secure Nested Paging (SEV-SNP) Hypervisor Support

Commit Message

Michael Roth Feb. 20, 2023, 6:37 p.m. UTC
From: Vishal Annapurve <vannapurve@google.com>

Introduce an HVA range operator so that other KVM subsystems
can operate on HVA ranges.

Signed-off-by: Vishal Annapurve <vannapurve@google.com>
[mdr: minor checkpatch alignment fixups]
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
 include/linux/kvm_host.h |  6 +++++
 virt/kvm/kvm_main.c      | 48 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

Comments

Zhi Wang Feb. 20, 2023, 9:37 p.m. UTC | #1
On Mon, 20 Feb 2023 12:37:55 -0600
Michael Roth <michael.roth@amd.com> wrote:

> From: Vishal Annapurve <vannapurve@google.com>
> 
> Introduce an HVA range operator so that other KVM subsystems
> can operate on HVA ranges.
> 
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> [mdr: minor checkpatch alignment fixups]
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
>  include/linux/kvm_host.h |  6 +++++
>  virt/kvm/kvm_main.c      | 48 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 54 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4d542060cd93..c615650ed256 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1402,6 +1402,12 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm);
>  void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
>  void kvm_mmu_invalidate_end(struct kvm *kvm);
>  
> +typedef int (*kvm_hva_range_op_t)(struct kvm *kvm,
> +				struct kvm_gfn_range *range, void *data);
> +
> +int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
> +			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data);
> +
>  long kvm_arch_dev_ioctl(struct file *filp,
>  			unsigned int ioctl, unsigned long arg);
>  long kvm_arch_vcpu_ioctl(struct file *filp,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index f7e00593cc5d..4ccd655dd5af 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -642,6 +642,54 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>  	return (int)ret;
>  }
>  

The function below seems to be a reduced duplicate of __kvm_handle_hva_range()
in virt/kvm/kvm_main.c. It would be nice to refactor __kvm_handle_hva_range()
so that it can be reused here.

> +int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
> +			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data)
> +{
> +	int ret = 0;
> +	struct kvm_gfn_range gfn_range;
> +	struct kvm_memory_slot *slot;
> +	struct kvm_memslots *slots;
> +	int i, idx;
> +
> +	if (WARN_ON_ONCE(hva_end <= hva_start))
> +		return -EINVAL;
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +
> +	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
> +		struct interval_tree_node *node;
> +
> +		slots = __kvm_memslots(kvm, i);
> +		kvm_for_each_memslot_in_hva_range(node, slots,
> +						  hva_start, hva_end - 1) {
> +			unsigned long start, end;
> +
> +			slot = container_of(node, struct kvm_memory_slot,
> +					    hva_node[slots->node_idx]);
> +			start = max(hva_start, slot->userspace_addr);
> +			end = min(hva_end, slot->userspace_addr +
> +						  (slot->npages << PAGE_SHIFT));
> +
> +			/*
> +			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
> +			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
> +			 */
> +			gfn_range.start = hva_to_gfn_memslot(start, slot);
> +			gfn_range.end = hva_to_gfn_memslot(end + PAGE_SIZE - 1, slot);
> +			gfn_range.slot = slot;
> +
> +			ret = handler(kvm, &gfn_range, data);
> +			if (ret)
> +				goto e_ret;
> +		}
> +	}
> +
> +e_ret:
> +	srcu_read_unlock(&kvm->srcu, idx);
> +
> +	return ret;
> +}
> +
>  static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
>  						unsigned long start,
>  						unsigned long end,
Michael Roth March 27, 2023, 12:34 a.m. UTC | #2
On Mon, Feb 20, 2023 at 11:37:09PM +0200, Zhi Wang wrote:
> On Mon, 20 Feb 2023 12:37:55 -0600
> Michael Roth <michael.roth@amd.com> wrote:
> 
> > From: Vishal Annapurve <vannapurve@google.com>
> > 
> > Introduce an HVA range operator so that other KVM subsystems
> > can operate on HVA ranges.
> > 
> > Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> > [mdr: minor checkpatch alignment fixups]
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > ---
> >  include/linux/kvm_host.h |  6 +++++
> >  virt/kvm/kvm_main.c      | 48 ++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 54 insertions(+)
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 4d542060cd93..c615650ed256 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1402,6 +1402,12 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm);
> >  void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
> >  void kvm_mmu_invalidate_end(struct kvm *kvm);
> >  
> > +typedef int (*kvm_hva_range_op_t)(struct kvm *kvm,
> > +				struct kvm_gfn_range *range, void *data);
> > +
> > +int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
> > +			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data);
> > +
> >  long kvm_arch_dev_ioctl(struct file *filp,
> >  			unsigned int ioctl, unsigned long arg);
> >  long kvm_arch_vcpu_ioctl(struct file *filp,
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index f7e00593cc5d..4ccd655dd5af 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -642,6 +642,54 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
> >  	return (int)ret;
> >  }
> >  
> 
> The function below seems to be a reduced duplicate of __kvm_handle_hva_range()
> in virt/kvm/kvm_main.c. It would be nice to refactor __kvm_handle_hva_range()
> so that it can be reused here.

A few differences make it difficult to refactor this clearly:

  - This handler is mainly used for loading initial contents into the
    guest image before booting, and doesn't rely on the MMU lock being
    held. It also *can't* be called with the MMU lock held, because it
    suffers from the same issue as the mem_attr_update() hook: it needs
    to take a mutex while unmapping pages from the directmap when
    transitioning them to the private state in the RMP table.
  - This handler wants to return an error code, as opposed to the
    existing handlers, which return true/false values that are passed
    along to the MMU notifier call-site and handled differently.
  - This handler wants to terminate iterating through memslots as soon
    as it encounters the first failure, whereas the existing handlers
    expect to be called for each slot regardless of return value.
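
To make the above concrete, the intended usage is roughly the sketch
below (the handler and struct names are invented purely for
illustration, they aren't taken from this series):

struct count_args {
	unsigned long max_pages;
	unsigned long npages;
};

/*
 * Example handler: counts the pages covered by the HVA range and fails
 * once a caller-supplied limit is exceeded.  A non-zero errno return
 * terminates the memslot walk immediately.
 */
static int count_region(struct kvm *kvm, struct kvm_gfn_range *range,
			void *data)
{
	struct count_args *args = data;

	args->npages += range->end - range->start;
	if (args->npages > args->max_pages)
		return -E2BIG;

	return 0;
}

	/* ...then, from a VM ioctl path with no MMU lock held: */
	struct count_args args = { .max_pages = max, .npages = 0 };
	int ret = kvm_vm_do_hva_range_op(kvm, hva_start, hva_end,
					 count_region, &args);

An actual user does real work per gfn (e.g. copying/encrypting initial
guest contents), but the errno return and the stop-on-first-failure
semantics are what matter here.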

So it's a pretty different use-case, and supporting it adds enough
complexity to __kvm_handle_hva_range() that it might not be worth
refactoring it, since it complicates some bits that are closely tied
to dealing with invalidations, where any extra complexity needs to be
well-warranted.

I took a stab at it here for reference, but even with what seems to be
the minimal set of changes it doesn't save on any code and ultimately I
think it makes it harder to make sense of what's going on:

  https://github.com/mdroth/linux/commit/976c5fb708f7babe899fd80e27e19f8ba3f6818d

Is there a better approach?

Thanks,

-Mike

> 
> > +int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
> > +			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data)
> > +{
> > +	int ret = 0;
> > +	struct kvm_gfn_range gfn_range;
> > +	struct kvm_memory_slot *slot;
> > +	struct kvm_memslots *slots;
> > +	int i, idx;
> > +
> > +	if (WARN_ON_ONCE(hva_end <= hva_start))
> > +		return -EINVAL;
> > +
> > +	idx = srcu_read_lock(&kvm->srcu);
> > +
> > +	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
> > +		struct interval_tree_node *node;
> > +
> > +		slots = __kvm_memslots(kvm, i);
> > +		kvm_for_each_memslot_in_hva_range(node, slots,
> > +						  hva_start, hva_end - 1) {
> > +			unsigned long start, end;
> > +
> > +			slot = container_of(node, struct kvm_memory_slot,
> > +					    hva_node[slots->node_idx]);
> > +			start = max(hva_start, slot->userspace_addr);
> > +			end = min(hva_end, slot->userspace_addr +
> > +						  (slot->npages << PAGE_SHIFT));
> > +
> > +			/*
> > +			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
> > +			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
> > +			 */
> > +			gfn_range.start = hva_to_gfn_memslot(start, slot);
> > +			gfn_range.end = hva_to_gfn_memslot(end + PAGE_SIZE - 1, slot);
> > +			gfn_range.slot = slot;
> > +
> > +			ret = handler(kvm, &gfn_range, data);
> > +			if (ret)
> > +				goto e_ret;
> > +		}
> > +	}
> > +
> > +e_ret:
> > +	srcu_read_unlock(&kvm->srcu, idx);
> > +
> > +	return ret;
> > +}
> > +
> >  static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
> >  						unsigned long start,
> >  						unsigned long end,
>
Zhi Wang April 4, 2023, 2:40 p.m. UTC | #3
On Sun, 26 Mar 2023 19:34:44 -0500
Michael Roth <michael.roth@amd.com> wrote:

> On Mon, Feb 20, 2023 at 11:37:09PM +0200, Zhi Wang wrote:
> > On Mon, 20 Feb 2023 12:37:55 -0600
> > Michael Roth <michael.roth@amd.com> wrote:
> > 
> > > From: Vishal Annapurve <vannapurve@google.com>
> > > 
> > > Introduce an HVA range operator so that other KVM subsystems
> > > can operate on HVA ranges.
> > > 
> > > Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> > > [mdr: minor checkpatch alignment fixups]
> > > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > > ---
> > >  include/linux/kvm_host.h |  6 +++++
> > >  virt/kvm/kvm_main.c      | 48 ++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 54 insertions(+)
> > > 
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 4d542060cd93..c615650ed256 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1402,6 +1402,12 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm);
> > >  void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
> > >  void kvm_mmu_invalidate_end(struct kvm *kvm);
> > >  
> > > +typedef int (*kvm_hva_range_op_t)(struct kvm *kvm,
> > > +				struct kvm_gfn_range *range, void *data);
> > > +
> > > +int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
> > > +			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data);
> > > +
> > >  long kvm_arch_dev_ioctl(struct file *filp,
> > >  			unsigned int ioctl, unsigned long arg);
> > >  long kvm_arch_vcpu_ioctl(struct file *filp,
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index f7e00593cc5d..4ccd655dd5af 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -642,6 +642,54 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
> > >  	return (int)ret;
> > >  }
> > >  
> > 
> > The function below seems to be a reduced duplicate of __kvm_handle_hva_range()
> > in virt/kvm/kvm_main.c. It would be nice to refactor __kvm_handle_hva_range()
> > so that it can be reused here.
> 
> A few differences make it difficult to refactor this clearly:
> 
>   - This handler is mainly used for loading initial contents into the
>     guest image before booting, and doesn't rely on the MMU lock being
>     held. It also *can't* be called with the MMU lock held, because it
>     suffers from the same issue as the mem_attr_update() hook: it needs
>     to take a mutex while unmapping pages from the directmap when
>     transitioning them to the private state in the RMP table.
>   - This handler wants to return an error code, as opposed to the
>     existing handlers, which return true/false values that are passed
>     along to the MMU notifier call-site and handled differently.
>   - This handler wants to terminate iterating through memslots as soon
>     as it encounters the first failure, whereas the existing handlers
>     expect to be called for each slot regardless of return value.
> 
> So it's a pretty different use-case, and supporting it adds enough
> complexity to __kvm_handle_hva_range() that it might not be worth
> refactoring it, since it complicates some bits that are closely tied
> to dealing with invalidations, where any extra complexity needs to be
> well-warranted.
> 
> I took a stab at it here for reference, but even with what seems to be
> the minimal set of changes it doesn't save on any code and ultimately I
> think it makes it harder to make sense of what's going on:
> 
>   https://github.com/mdroth/linux/commit/976c5fb708f7babe899fd80e27e19f8ba3f6818d
> 
> Is there a better approach?
> 

Those requirements look pretty suitable for kvm_handle_hva_range(). I guess
we just need to extend the iterator a little bit.

My ideas:

1) Add a lock flag in struct kvm_hva_range to indicate if kvm_lock is required
or not during the iteration. Check the flag with if (!locked && hva_range.need_lock).
Then the unlock part can be left untouched.

2) Add an error code in struct kvm_gfn_range that the handler can set, so that
__kvm_handle_hva_range() can check gfn_range.err after ret |= handler(xxx).
If the err is set, bail out.

3) Return gfn_range.err to the caller. The caller can decide how to convert
it (to a boolean, or keep it as-is).

4) Set hva_range.need_lock appropriately in the existing callers and in the new one.

How about this?
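
Roughly, as a sketch only (need_lock and err are invented names, and the
exact placement inside __kvm_handle_hva_range() would need care), the
per-memslot part of the loop could become something like:

	gfn_range.err = 0;

	if (!locked && range->need_lock) {
		locked = true;
		KVM_MMU_LOCK(kvm);
	}

	ret |= range->handler(kvm, &gfn_range);
	if (gfn_range.err)
		goto out_unlock;	/* stop on the first failure */

with an out_unlock label added in front of the existing unlock/cleanup
path, and gfn_range.err (or 0) handed back so that the new caller gets a
real errno while the MMU-notifier callers keep the boolean ret they have
today.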

> Thanks,
> 
> -Mike
> 
> > 
> > > +int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
> > > +			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data)
> > > +{
> > > +	int ret = 0;
> > > +	struct kvm_gfn_range gfn_range;
> > > +	struct kvm_memory_slot *slot;
> > > +	struct kvm_memslots *slots;
> > > +	int i, idx;
> > > +
> > > +	if (WARN_ON_ONCE(hva_end <= hva_start))
> > > +		return -EINVAL;
> > > +
> > > +	idx = srcu_read_lock(&kvm->srcu);
> > > +
> > > +	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
> > > +		struct interval_tree_node *node;
> > > +
> > > +		slots = __kvm_memslots(kvm, i);
> > > +		kvm_for_each_memslot_in_hva_range(node, slots,
> > > +						  hva_start, hva_end - 1) {
> > > +			unsigned long start, end;
> > > +
> > > +			slot = container_of(node, struct kvm_memory_slot,
> > > +					    hva_node[slots->node_idx]);
> > > +			start = max(hva_start, slot->userspace_addr);
> > > +			end = min(hva_end, slot->userspace_addr +
> > > +						  (slot->npages << PAGE_SHIFT));
> > > +
> > > +			/*
> > > +			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
> > > +			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
> > > +			 */
> > > +			gfn_range.start = hva_to_gfn_memslot(start, slot);
> > > +			gfn_range.end = hva_to_gfn_memslot(end + PAGE_SIZE - 1, slot);
> > > +			gfn_range.slot = slot;
> > > +
> > > +			ret = handler(kvm, &gfn_range, data);
> > > +			if (ret)
> > > +				goto e_ret;
> > > +		}
> > > +	}
> > > +
> > > +e_ret:
> > > +	srcu_read_unlock(&kvm->srcu, idx);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > >  static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
> > >  						unsigned long start,
> > >  						unsigned long end,
> >

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4d542060cd93..c615650ed256 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1402,6 +1402,12 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm);
 void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
 void kvm_mmu_invalidate_end(struct kvm *kvm);
 
+typedef int (*kvm_hva_range_op_t)(struct kvm *kvm,
+				struct kvm_gfn_range *range, void *data);
+
+int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
+			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data);
+
 long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg);
 long kvm_arch_vcpu_ioctl(struct file *filp,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f7e00593cc5d..4ccd655dd5af 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -642,6 +642,54 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
 	return (int)ret;
 }
 
+int kvm_vm_do_hva_range_op(struct kvm *kvm, unsigned long hva_start,
+			   unsigned long hva_end, kvm_hva_range_op_t handler, void *data)
+{
+	int ret = 0;
+	struct kvm_gfn_range gfn_range;
+	struct kvm_memory_slot *slot;
+	struct kvm_memslots *slots;
+	int i, idx;
+
+	if (WARN_ON_ONCE(hva_end <= hva_start))
+		return -EINVAL;
+
+	idx = srcu_read_lock(&kvm->srcu);
+
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		struct interval_tree_node *node;
+
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot_in_hva_range(node, slots,
+						  hva_start, hva_end - 1) {
+			unsigned long start, end;
+
+			slot = container_of(node, struct kvm_memory_slot,
+					    hva_node[slots->node_idx]);
+			start = max(hva_start, slot->userspace_addr);
+			end = min(hva_end, slot->userspace_addr +
+						  (slot->npages << PAGE_SHIFT));
+
+			/*
+			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+			 */
+			gfn_range.start = hva_to_gfn_memslot(start, slot);
+			gfn_range.end = hva_to_gfn_memslot(end + PAGE_SIZE - 1, slot);
+			gfn_range.slot = slot;
+
+			ret = handler(kvm, &gfn_range, data);
+			if (ret)
+				goto e_ret;
+		}
+	}
+
+e_ret:
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return ret;
+}
+
 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
 						unsigned long start,
 						unsigned long end,