Message ID | 20220706082016.2603916-10-chao.p.peng@linux.intel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | KVM: mm: fd-based approach for supporting KVM guest private memory | expand |
On Wed, Jul 06, 2022, Chao Peng wrote: > @@ -1332,9 +1332,18 @@ yet and must be cleared on entry. > __u64 userspace_addr; /* start of the userspace allocated memory */ > }; > > + struct kvm_userspace_memory_region_ext { > + struct kvm_userspace_memory_region region; > + __u64 private_offset; > + __u32 private_fd; > + __u32 pad1; > + __u64 pad2[14]; > +}; > + > /* for kvm_memory_region::flags */ > #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) > #define KVM_MEM_READONLY (1UL << 1) > + #define KVM_MEM_PRIVATE (1UL << 2) Very belatedly following up on prior feedback... | I think a flag is still needed, the problem is private_fd can be safely | accessed only when this flag is set, e.g. without this flag, we can't | copy_from_user these new fields since they don't exist for previous | kvm_userspace_memory_region callers. I forgot about that aspect of things. We don't technically need a dedicated PRIVATE flag to handle that, but it does seem to be the least awful soltuion. We could either add a generic KVM_MEM_EXTENDED_REGION or an entirely new ioctl(), e.g. KVM_SET_USER_MEMORY_REGION2, but in both approaches there's a decent chance that we'll end up needed individual "this field is valid" flags anways. E.g. if KVM requires pad1 and pad2 to be zero to carve out future extensions, then we're right back here if some future extension needs to treat '0' as a legal input. TL;DR: adding KVM_MEM_PRIVATE still seems like the best approach. > @@ -4631,14 +4658,35 @@ static long kvm_vm_ioctl(struct file *filp, > break; > } > case KVM_SET_USER_MEMORY_REGION: { > - struct kvm_userspace_memory_region kvm_userspace_mem; > + struct kvm_user_mem_region mem; > + unsigned long size; > + u32 flags; > + > + kvm_sanity_check_user_mem_region_alias(); > + > + memset(&mem, 0, sizeof(mem)); > > r = -EFAULT; > - if (copy_from_user(&kvm_userspace_mem, argp, > - sizeof(kvm_userspace_mem))) > + > + if (get_user(flags, > + (u32 __user *)(argp + offsetof(typeof(mem), flags)))) > + goto out; Indentation is funky. It's hard to massage this into something short and readable What about capturing the offset separately? E.g. struct kvm_user_mem_region mem; unsigned int flags_offset = offsetof(typeof(mem), flags)); unsigned long size; u32 flags; kvm_sanity_check_user_mem_region_alias(); memset(&mem, 0, sizeof(mem)); r = -EFAULT; if (get_user(flags, (u32 __user *)(argp + flags_offset))) goto out; But this can actually be punted until KVM_MEM_PRIVATE is fully supported. As of this patch, KVM doesn't read the extended size, so I believe the diff for this patch can simply be: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index da263c370d00..5194beb7b52f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4640,6 +4640,10 @@ static long kvm_vm_ioctl(struct file *filp, sizeof(kvm_userspace_mem))) goto out; + r = -EINVAL; + if (mem.flags & KVM_MEM_PRIVATE) + goto out; + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); break; }
On Fri, Jul 29, 2022 at 07:51:29PM +0000, Sean Christopherson wrote: > On Wed, Jul 06, 2022, Chao Peng wrote: > > @@ -1332,9 +1332,18 @@ yet and must be cleared on entry. > > __u64 userspace_addr; /* start of the userspace allocated memory */ > > }; > > > > + struct kvm_userspace_memory_region_ext { > > + struct kvm_userspace_memory_region region; > > + __u64 private_offset; > > + __u32 private_fd; > > + __u32 pad1; > > + __u64 pad2[14]; > > +}; > > + > > /* for kvm_memory_region::flags */ > > #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) > > #define KVM_MEM_READONLY (1UL << 1) > > + #define KVM_MEM_PRIVATE (1UL << 2) > > Very belatedly following up on prior feedback... > > | I think a flag is still needed, the problem is private_fd can be safely > | accessed only when this flag is set, e.g. without this flag, we can't > | copy_from_user these new fields since they don't exist for previous > | kvm_userspace_memory_region callers. > > I forgot about that aspect of things. We don't technically need a dedicated > PRIVATE flag to handle that, but it does seem to be the least awful soltuion. > We could either add a generic KVM_MEM_EXTENDED_REGION or an entirely new > ioctl(), e.g. KVM_SET_USER_MEMORY_REGION2, but in both approaches there's a decent > chance that we'll end up needed individual "this field is valid" flags anways. > > E.g. if KVM requires pad1 and pad2 to be zero to carve out future extensions, > then we're right back here if some future extension needs to treat '0' as a legal > input. I had such practice (always rejecting none-zero 'pad' value when introducing new user APIs) in other project previously, but I rarely see that in KVM. > > TL;DR: adding KVM_MEM_PRIVATE still seems like the best approach. > > > @@ -4631,14 +4658,35 @@ static long kvm_vm_ioctl(struct file *filp, > > break; > > } > > case KVM_SET_USER_MEMORY_REGION: { > > - struct kvm_userspace_memory_region kvm_userspace_mem; > > + struct kvm_user_mem_region mem; > > + unsigned long size; > > + u32 flags; > > + > > + kvm_sanity_check_user_mem_region_alias(); > > + > > + memset(&mem, 0, sizeof(mem)); > > > > r = -EFAULT; > > - if (copy_from_user(&kvm_userspace_mem, argp, > > - sizeof(kvm_userspace_mem))) > > + > > + if (get_user(flags, > > + (u32 __user *)(argp + offsetof(typeof(mem), flags)))) > > + goto out; > > > Indentation is funky. It's hard to massage this into something short and > readable What about capturing the offset separately? E.g. > > struct kvm_user_mem_region mem; > unsigned int flags_offset = offsetof(typeof(mem), flags)); > unsigned long size; > u32 flags; > > kvm_sanity_check_user_mem_region_alias(); > > memset(&mem, 0, sizeof(mem)); > > r = -EFAULT; > if (get_user(flags, (u32 __user *)(argp + flags_offset))) > goto out; > > But this can actually be punted until KVM_MEM_PRIVATE is fully supported. As of > this patch, KVM doesn't read the extended size, so I believe the diff for this > patch can simply be: Looks good to me, Thanks. Chao > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index da263c370d00..5194beb7b52f 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -4640,6 +4640,10 @@ static long kvm_vm_ioctl(struct file *filp, > sizeof(kvm_userspace_mem))) > goto out; > > + r = -EINVAL; > + if (mem.flags & KVM_MEM_PRIVATE) > + goto out; > + > r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); > break; > }
On Wed, Aug 03, 2022, Chao Peng wrote: > On Fri, Jul 29, 2022 at 07:51:29PM +0000, Sean Christopherson wrote: > > On Wed, Jul 06, 2022, Chao Peng wrote: > > > @@ -1332,9 +1332,18 @@ yet and must be cleared on entry. > > > __u64 userspace_addr; /* start of the userspace allocated memory */ > > > }; > > > > > > + struct kvm_userspace_memory_region_ext { > > > + struct kvm_userspace_memory_region region; > > > + __u64 private_offset; > > > + __u32 private_fd; > > > + __u32 pad1; > > > + __u64 pad2[14]; > > > +}; > > > + > > > /* for kvm_memory_region::flags */ > > > #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) > > > #define KVM_MEM_READONLY (1UL << 1) > > > + #define KVM_MEM_PRIVATE (1UL << 2) > > > > Very belatedly following up on prior feedback... > > > > | I think a flag is still needed, the problem is private_fd can be safely > > | accessed only when this flag is set, e.g. without this flag, we can't > > | copy_from_user these new fields since they don't exist for previous > > | kvm_userspace_memory_region callers. > > > > I forgot about that aspect of things. We don't technically need a dedicated > > PRIVATE flag to handle that, but it does seem to be the least awful soltuion. > > We could either add a generic KVM_MEM_EXTENDED_REGION or an entirely new > > ioctl(), e.g. KVM_SET_USER_MEMORY_REGION2, but in both approaches there's a decent > > chance that we'll end up needed individual "this field is valid" flags anways. > > > > E.g. if KVM requires pad1 and pad2 to be zero to carve out future extensions, > > then we're right back here if some future extension needs to treat '0' as a legal > > input. > > I had such practice (always rejecting none-zero 'pad' value when > introducing new user APIs) in other project previously, but I rarely > see that in KVM. Ya, KVM often uses flags to indicate the validity of a field specifically so that KVM doesn't misinterpret a '0' from an older userspace as an intended value.
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index bafaeedd455c..4f27c973a952 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1319,7 +1319,7 @@ yet and must be cleared on entry. :Capability: KVM_CAP_USER_MEMORY :Architectures: all :Type: vm ioctl -:Parameters: struct kvm_userspace_memory_region (in) +:Parameters: struct kvm_userspace_memory_region(_ext) (in) :Returns: 0 on success, -1 on error :: @@ -1332,9 +1332,18 @@ yet and must be cleared on entry. __u64 userspace_addr; /* start of the userspace allocated memory */ }; + struct kvm_userspace_memory_region_ext { + struct kvm_userspace_memory_region region; + __u64 private_offset; + __u32 private_fd; + __u32 pad1; + __u64 pad2[14]; +}; + /* for kvm_memory_region::flags */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) + #define KVM_MEM_PRIVATE (1UL << 2) This ioctl allows the user to create, modify or delete a guest physical memory slot. Bits 0-15 of "slot" specify the slot id and this value @@ -1365,12 +1374,27 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr be identical. This allows large pages in the guest to be backed by large pages in the host. -The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and -KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of -writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to -use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, -to make a new slot read-only. In this case, writes to this memory will be -posted to userspace as KVM_EXIT_MMIO exits. +kvm_userspace_memory_region_ext includes all the kvm_userspace_memory_region +fields. It also includes additional fields for some specific features. See +below description of flags field for more information. It's recommended to use +kvm_userspace_memory_region_ext in new userspace code. + +The flags field supports below flags: + +- KVM_MEM_LOG_DIRTY_PAGES can be set to instruct KVM to keep track of writes to + memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to use it. + +- KVM_MEM_READONLY can be set, if KVM_CAP_READONLY_MEM capability allows it, to + make a new slot read-only. In this case, writes to this memory will be posted + to userspace as KVM_EXIT_MMIO exits. + +- KVM_MEM_PRIVATE can be set to indicate a new slot has private memory backed by + a file descirptor(fd) and the content of the private memory is invisible to + userspace. In this case, userspace should use private_fd/private_offset in + kvm_userspace_memory_region_ext to instruct KVM to provide private memory to + guest. Userspace should guarantee not to map the same pfn indicated by + private_fd/private_offset to different gfns with multiple memslots. Failed to + do this may result undefined behavior. When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of the memory region are automatically reflected into the guest. For example, an diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index e3cbd7706136..1f160801e2a7 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -48,6 +48,8 @@ config KVM select SRCU select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM + select HAVE_KVM_PRIVATE_MEM if X86_64 + select MEMFILE_NOTIFIER if HAVE_KVM_PRIVATE_MEM help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 567d13405445..77d16b90045c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12154,7 +12154,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m; + struct kvm_user_mem_region m; m.slot = id | (i << 16); m.flags = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c262ebb168a7..1b203c8aa696 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -44,6 +44,7 @@ #include <asm/kvm_host.h> #include <linux/kvm_dirty_ring.h> +#include <linux/memfile_notifier.h> #ifndef KVM_MAX_VCPU_IDS #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS @@ -576,8 +577,16 @@ struct kvm_memory_slot { u32 flags; short id; u16 as_id; + struct file *private_file; + loff_t private_offset; + struct memfile_notifier notifier; }; +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) +{ + return slot && (slot->flags & KVM_MEM_PRIVATE); +} + static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -1109,9 +1118,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_user_mem_region *mem); int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_user_mem_region *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a36e78710382..c467c69b7ad7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -103,6 +103,33 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +struct kvm_userspace_memory_region_ext { + struct kvm_userspace_memory_region region; + __u64 private_offset; + __u32 private_fd; + __u32 pad1; + __u64 pad2[14]; +}; + +#ifdef __KERNEL__ +/* + * kvm_user_mem_region is a kernel-only alias of kvm_userspace_memory_region_ext + * that "unpacks" kvm_userspace_memory_region so that KVM can directly access + * all fields from the top-level "extended" region. + */ +struct kvm_user_mem_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 private_offset; + __u32 private_fd; + __u32 pad1; + __u64 pad2[14]; +}; +#endif + /* * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, * other bits are reserved for kvm internal use which are defined in @@ -110,6 +137,7 @@ struct kvm_userspace_memory_region { */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) +#define KVM_MEM_PRIVATE (1UL << 2) /* for KVM_IRQ_LINE */ struct kvm_irq_level { diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index a8c5c9f06b3c..ccaff13cc5b8 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -72,3 +72,6 @@ config KVM_XFER_TO_GUEST_WORK config HAVE_KVM_PM_NOTIFIER bool + +config HAVE_KVM_PRIVATE_MEM + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3ae4944b9f15..230c8ff9659c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1508,7 +1508,7 @@ static void kvm_replace_memslot(struct kvm *kvm, } } -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) +static int check_memory_region_flags(const struct kvm_user_mem_region *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; @@ -1902,7 +1902,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, * Must be called holding kvm->slots_lock for write. */ int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_user_mem_region *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2006,7 +2006,7 @@ int __kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_user_mem_region *mem) { int r; @@ -2018,7 +2018,7 @@ int kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(kvm_set_memory_region); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_user_mem_region *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; @@ -4608,6 +4608,33 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) return fd; } +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ +do { \ + BUILD_BUG_ON(offsetof(struct kvm_user_mem_region, field) != \ + offsetof(struct kvm_userspace_memory_region, field)); \ + BUILD_BUG_ON(sizeof_field(struct kvm_user_mem_region, field) != \ + sizeof_field(struct kvm_userspace_memory_region, field)); \ +} while (0) + +#define SANITY_CHECK_MEM_REGION_EXT_FIELD(field) \ +do { \ + BUILD_BUG_ON(offsetof(struct kvm_user_mem_region, field) != \ + offsetof(struct kvm_userspace_memory_region_ext, field)); \ + BUILD_BUG_ON(sizeof_field(struct kvm_user_mem_region, field) != \ + sizeof_field(struct kvm_userspace_memory_region_ext, field)); \ +} while (0) + +static void kvm_sanity_check_user_mem_region_alias(void) +{ + SANITY_CHECK_MEM_REGION_FIELD(slot); + SANITY_CHECK_MEM_REGION_FIELD(flags); + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); + SANITY_CHECK_MEM_REGION_FIELD(memory_size); + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); + SANITY_CHECK_MEM_REGION_EXT_FIELD(private_offset); + SANITY_CHECK_MEM_REGION_EXT_FIELD(private_fd); +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4631,14 +4658,35 @@ static long kvm_vm_ioctl(struct file *filp, break; } case KVM_SET_USER_MEMORY_REGION: { - struct kvm_userspace_memory_region kvm_userspace_mem; + struct kvm_user_mem_region mem; + unsigned long size; + u32 flags; + + kvm_sanity_check_user_mem_region_alias(); + + memset(&mem, 0, sizeof(mem)); r = -EFAULT; - if (copy_from_user(&kvm_userspace_mem, argp, - sizeof(kvm_userspace_mem))) + + if (get_user(flags, + (u32 __user *)(argp + offsetof(typeof(mem), flags)))) + goto out; + + if (flags & KVM_MEM_PRIVATE) { + r = -EINVAL; + goto out; + } + + size = sizeof(struct kvm_userspace_memory_region); + + if (copy_from_user(&mem, argp, size)) + goto out; + + r = -EINVAL; + if ((flags ^ mem.flags) & KVM_MEM_PRIVATE) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: {