diff mbox series

[v5,08/13] KVM: Use memfile_pfn_ops to obtain pfn for private pages

Message ID 20220310140911.50924-9-chao.p.peng@linux.intel.com (mailing list archive)
State New
Headers show
Series KVM: mm: fd-based approach for supporting KVM guest private memory | expand

Commit Message

Chao Peng March 10, 2022, 2:09 p.m. UTC
Private pages are not mmap-ed into userspace so can not reply on
get_user_pages() to obtain the pfn. Instead we add a memfile_pfn_ops
pointer pfn_ops in each private memslot and use it to obtain the pfn
for a gfn. To do that, KVM should convert the gfn to the offset into
the fd and then call get_lock_pfn callback. Once KVM completes its job
it should call put_unlock_pfn to unlock the pfn. Note the pfn(page) is
locked between get_lock_pfn/put_unlock_pfn to ensure pfn is valid when
KVM uses it to establish the mapping in the secondary MMU page table.

The pfn_ops is initialized via memfile_register_notifier from the memory
backing store that provided the private_fd.

Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 arch/x86/kvm/Kconfig     |  1 +
 include/linux/kvm_host.h | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

Comments

Sean Christopherson March 28, 2022, 11:56 p.m. UTC | #1
On Thu, Mar 10, 2022, Chao Peng wrote:
> @@ -2217,4 +2220,34 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
>  /* Max number of entries allowed for each kvm dirty ring */
>  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
>  
> +#ifdef CONFIG_MEMFILE_NOTIFIER
> +static inline long kvm_memfile_get_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
> +				       int *order)
> +{
> +	pgoff_t index = gfn - slot->base_gfn +
> +			(slot->private_offset >> PAGE_SHIFT);

This is broken for 32-bit kernels, where gfn_t is a 64-bit value but pgoff_t is a
32-bit value.  There's no reason to support this for 32-bit kernels, so...

The easiest fix, and likely most maintainable for other code too, would be to
add a dedicated CONFIG for private memory, and then have KVM check that for all
the memfile stuff.  x86 can then select it only for 64-bit kernels, and in turn
select MEMFILE_NOTIFIER iff private memory is supported.

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ca7b2a6a452a..ee9c8c155300 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -48,7 +48,9 @@ config KVM
        select SRCU
        select INTERVAL_TREE
        select HAVE_KVM_PM_NOTIFIER if PM
-       select MEMFILE_NOTIFIER
+       select HAVE_KVM_PRIVATE_MEM if X86_64
+       select MEMFILE_NOTIFIER if HAVE_KVM_PRIVATE_MEM
+
        help
          Support hosting fully virtualized guest machines using hardware
          virtualization extensions.  You will need a fairly recent

And in addition to replacing checks on CONFIG_MEMFILE_NOTIFIER, the probing of
whether or not KVM_MEM_PRIVATE is allowed can be:

@@ -1499,23 +1499,19 @@ static void kvm_replace_memslot(struct kvm *kvm,
        }
 }

-bool __weak kvm_arch_private_memory_supported(struct kvm *kvm)
-{
-       return false;
-}
-
 static int check_memory_region_flags(struct kvm *kvm,
                                const struct kvm_userspace_memory_region *mem)
 {
        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

-       if (kvm_arch_private_memory_supported(kvm))
-               valid_flags |= KVM_MEM_PRIVATE;
-
 #ifdef __KVM_HAVE_READONLY_MEM
        valid_flags |= KVM_MEM_READONLY;
 #endif

+#ifdef CONFIG_KVM_HAVE_PRIVATE_MEM
+       valid_flags |= KVM_MEM_PRIVATE;
+#endif
+
        if (mem->flags & ~valid_flags)
                return -EINVAL;

> +
> +	return slot->pfn_ops->get_lock_pfn(file_inode(slot->private_file),
> +					   index, order);

In a similar vein, get_locK_pfn() shouldn't return a "long".  KVM likely won't use
these APIs on 32-bit kernels, but that may not hold true for other subsystems, and
this code is confusing and technically wrong.  The pfns for struct page squeeze
into an unsigned long because PAE support is capped at 64gb, but casting to a
signed long could result in a pfn with bit 31 set being misinterpreted as an error.

Even returning an "unsigned long" for the pfn is wrong.  It "works" for the shmem
code because shmem deals only with struct page, but it's technically wrong, especially
since one of the selling points of this approach is that it can work without struct
page.

OUT params suck, but I don't see a better option than having the return value be
0/-errno, with "pfn_t *pfn" for the resolved pfn.

> +}
> +
> +static inline void kvm_memfile_put_pfn(struct kvm_memory_slot *slot,
> +				       kvm_pfn_t pfn)
> +{
> +	slot->pfn_ops->put_unlock_pfn(pfn);
> +}
> +
> +#else
> +static inline long kvm_memfile_get_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
> +				       int *order)
> +{

This should be a WARN_ON() as its usage should be guarded by a KVM_PRIVATE_MEM
check, and private memslots should be disallowed in this case.

Alternatively, it might be a good idea to #ifdef these out entirely and not provide
stubs.  That'd likely require a stub or two in arch code, but overall it might be
less painful in the long run, e.g. would force us to more carefully consider the
touch points for private memory.  Definitely not a requirement, just an idea.
Chao Peng April 8, 2022, 2:07 p.m. UTC | #2
On Mon, Mar 28, 2022 at 11:56:06PM +0000, Sean Christopherson wrote:
> On Thu, Mar 10, 2022, Chao Peng wrote:
> > @@ -2217,4 +2220,34 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
> >  /* Max number of entries allowed for each kvm dirty ring */
> >  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
> >  
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +static inline long kvm_memfile_get_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
> > +				       int *order)
> > +{
> > +	pgoff_t index = gfn - slot->base_gfn +
> > +			(slot->private_offset >> PAGE_SHIFT);
> 
> This is broken for 32-bit kernels, where gfn_t is a 64-bit value but pgoff_t is a
> 32-bit value.  There's no reason to support this for 32-bit kernels, so...
> 
> The easiest fix, and likely most maintainable for other code too, would be to
> add a dedicated CONFIG for private memory, and then have KVM check that for all
> the memfile stuff.  x86 can then select it only for 64-bit kernels, and in turn
> select MEMFILE_NOTIFIER iff private memory is supported.

Looks good.

> 
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index ca7b2a6a452a..ee9c8c155300 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -48,7 +48,9 @@ config KVM
>         select SRCU
>         select INTERVAL_TREE
>         select HAVE_KVM_PM_NOTIFIER if PM
> -       select MEMFILE_NOTIFIER
> +       select HAVE_KVM_PRIVATE_MEM if X86_64
> +       select MEMFILE_NOTIFIER if HAVE_KVM_PRIVATE_MEM
> +
>         help
>           Support hosting fully virtualized guest machines using hardware
>           virtualization extensions.  You will need a fairly recent
> 
> And in addition to replacing checks on CONFIG_MEMFILE_NOTIFIER, the probing of
> whether or not KVM_MEM_PRIVATE is allowed can be:
> 
> @@ -1499,23 +1499,19 @@ static void kvm_replace_memslot(struct kvm *kvm,
>         }
>  }
> 
> -bool __weak kvm_arch_private_memory_supported(struct kvm *kvm)
> -{
> -       return false;
> -}
> -
>  static int check_memory_region_flags(struct kvm *kvm,
>                                 const struct kvm_userspace_memory_region *mem)
>  {
>         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> 
> -       if (kvm_arch_private_memory_supported(kvm))
> -               valid_flags |= KVM_MEM_PRIVATE;
> -
>  #ifdef __KVM_HAVE_READONLY_MEM
>         valid_flags |= KVM_MEM_READONLY;
>  #endif
> 
> +#ifdef CONFIG_KVM_HAVE_PRIVATE_MEM
> +       valid_flags |= KVM_MEM_PRIVATE;
> +#endif
> +
>         if (mem->flags & ~valid_flags)
>                 return -EINVAL;
> 
> > +
> > +	return slot->pfn_ops->get_lock_pfn(file_inode(slot->private_file),
> > +					   index, order);
> 
> In a similar vein, get_locK_pfn() shouldn't return a "long".  KVM likely won't use
> these APIs on 32-bit kernels, but that may not hold true for other subsystems, and
> this code is confusing and technically wrong.  The pfns for struct page squeeze
> into an unsigned long because PAE support is capped at 64gb, but casting to a
> signed long could result in a pfn with bit 31 set being misinterpreted as an error.
> 
> Even returning an "unsigned long" for the pfn is wrong.  It "works" for the shmem
> code because shmem deals only with struct page, but it's technically wrong, especially
> since one of the selling points of this approach is that it can work without struct
> page.

Hmmm, that's correct.

> 
> OUT params suck, but I don't see a better option than having the return value be
> 0/-errno, with "pfn_t *pfn" for the resolved pfn.
> 
> > +}
> > +
> > +static inline void kvm_memfile_put_pfn(struct kvm_memory_slot *slot,
> > +				       kvm_pfn_t pfn)
> > +{
> > +	slot->pfn_ops->put_unlock_pfn(pfn);
> > +}
> > +
> > +#else
> > +static inline long kvm_memfile_get_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
> > +				       int *order)
> > +{
> 
> This should be a WARN_ON() as its usage should be guarded by a KVM_PRIVATE_MEM
> check, and private memslots should be disallowed in this case.
> 
> Alternatively, it might be a good idea to #ifdef these out entirely and not provide
> stubs.  That'd likely require a stub or two in arch code, but overall it might be
> less painful in the long run, e.g. would force us to more carefully consider the
> touch points for private memory.  Definitely not a requirement, just an idea.

Make sense, let me try.

Thanks,
Chao
Chao Peng April 28, 2022, 12:37 p.m. UTC | #3
On Mon, Mar 28, 2022 at 11:56:06PM +0000, Sean Christopherson wrote:
> On Thu, Mar 10, 2022, Chao Peng wrote:
> > @@ -2217,4 +2220,34 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
> >  /* Max number of entries allowed for each kvm dirty ring */
> >  #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
> >  
> > +#ifdef CONFIG_MEMFILE_NOTIFIER
> > +static inline long kvm_memfile_get_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
> > +				       int *order)
> > +{
> > +	pgoff_t index = gfn - slot->base_gfn +
> > +			(slot->private_offset >> PAGE_SHIFT);
> 
> This is broken for 32-bit kernels, where gfn_t is a 64-bit value but pgoff_t is a
> 32-bit value.  There's no reason to support this for 32-bit kernels, so...
> 
> The easiest fix, and likely most maintainable for other code too, would be to
> add a dedicated CONFIG for private memory, and then have KVM check that for all
> the memfile stuff.  x86 can then select it only for 64-bit kernels, and in turn
> select MEMFILE_NOTIFIER iff private memory is supported.
> 
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index ca7b2a6a452a..ee9c8c155300 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -48,7 +48,9 @@ config KVM
>         select SRCU
>         select INTERVAL_TREE
>         select HAVE_KVM_PM_NOTIFIER if PM
> -       select MEMFILE_NOTIFIER
> +       select HAVE_KVM_PRIVATE_MEM if X86_64
> +       select MEMFILE_NOTIFIER if HAVE_KVM_PRIVATE_MEM
> +
>         help
>           Support hosting fully virtualized guest machines using hardware
>           virtualization extensions.  You will need a fairly recent
> 
> And in addition to replacing checks on CONFIG_MEMFILE_NOTIFIER, the probing of
> whether or not KVM_MEM_PRIVATE is allowed can be:
> 
> @@ -1499,23 +1499,19 @@ static void kvm_replace_memslot(struct kvm *kvm,
>         }
>  }
> 
> -bool __weak kvm_arch_private_memory_supported(struct kvm *kvm)
> -{
> -       return false;
> -}
> -
>  static int check_memory_region_flags(struct kvm *kvm,
>                                 const struct kvm_userspace_memory_region *mem)
>  {
>         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> 
> -       if (kvm_arch_private_memory_supported(kvm))
> -               valid_flags |= KVM_MEM_PRIVATE;
> -
>  #ifdef __KVM_HAVE_READONLY_MEM
>         valid_flags |= KVM_MEM_READONLY;
>  #endif
> 
> +#ifdef CONFIG_KVM_HAVE_PRIVATE_MEM
> +       valid_flags |= KVM_MEM_PRIVATE;
> +#endif

One thing to mention is CONFIG_KVM_HAVE_PRIVATE_MEM is build-time thing.
Do you think we should or not do that for runtime? E.g. expose by vm_type
so only when TDX is enabled KVM_MEM_PRIVATE is exposed.

Chao
diff mbox series

Patch

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index e3cbd7706136..ca7b2a6a452a 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -48,6 +48,7 @@  config KVM
 	select SRCU
 	select INTERVAL_TREE
 	select HAVE_KVM_PM_NOTIFIER if PM
+	select MEMFILE_NOTIFIER
 	help
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c92c70174248..6e1d770d6bf8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -44,6 +44,7 @@ 
 
 #include <asm/kvm_host.h>
 #include <linux/kvm_dirty_ring.h>
+#include <linux/memfile_notifier.h>
 
 #ifndef KVM_MAX_VCPU_IDS
 #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
@@ -565,6 +566,7 @@  struct kvm_memory_slot {
 	u16 as_id;
 	struct file *private_file;
 	loff_t private_offset;
+	struct memfile_pfn_ops *pfn_ops;
 };
 
 static inline bool kvm_slot_is_private(const struct kvm_memory_slot *slot)
@@ -915,6 +917,7 @@  static inline void kvm_irqfd_exit(void)
 {
 }
 #endif
+
 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		  struct module *module);
 void kvm_exit(void);
@@ -2217,4 +2220,34 @@  static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+#ifdef CONFIG_MEMFILE_NOTIFIER
+static inline long kvm_memfile_get_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
+				       int *order)
+{
+	pgoff_t index = gfn - slot->base_gfn +
+			(slot->private_offset >> PAGE_SHIFT);
+
+	return slot->pfn_ops->get_lock_pfn(file_inode(slot->private_file),
+					   index, order);
+}
+
+static inline void kvm_memfile_put_pfn(struct kvm_memory_slot *slot,
+				       kvm_pfn_t pfn)
+{
+	slot->pfn_ops->put_unlock_pfn(pfn);
+}
+
+#else
+static inline long kvm_memfile_get_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
+				       int *order)
+{
+	return -1;
+}
+
+static inline void kvm_memfile_put_pfn(struct kvm_memory_slot *slot,
+				       kvm_pfn_t pfn)
+{
+}
+#endif /* CONFIG_MEMFILE_NOTIFIER */
+
 #endif