Message ID | 20240417153450.3608097-3-pbonzini@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: Guest Memory Pre-Population API | expand |
On Wed, Apr 17, 2024 at 11:34:45AM -0400, Paolo Bonzini <pbonzini@redhat.com> wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > Add a new ioctl KVM_MAP_MEMORY in the KVM common code. It iterates on the > memory range and calls the arch-specific function. Add stub arch function > as a weak symbol. > > Suggested-by: Sean Christopherson <seanjc@google.com> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com> > Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- > include/linux/kvm_host.h | 5 ++++ > include/uapi/linux/kvm.h | 10 +++++++ > virt/kvm/Kconfig | 3 ++ > virt/kvm/kvm_main.c | 61 ++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 79 insertions(+) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 8dea11701ab2..2b0f0240a64c 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages > void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); > #endif > > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY > +int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu, > + struct kvm_map_memory *mapping); > +#endif > + > #endif > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 2190adbe3002..4d233f44c613 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -917,6 +917,7 @@ struct kvm_enable_cap { > #define KVM_CAP_MEMORY_ATTRIBUTES 233 > #define KVM_CAP_GUEST_MEMFD 234 > #define KVM_CAP_VM_TYPES 235 > +#define KVM_CAP_MAP_MEMORY 236 > > struct kvm_irq_routing_irqchip { > __u32 irqchip; > @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { > __u64 reserved[6]; > }; > > +#define KVM_MAP_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_map_memory) > + > +struct kvm_map_memory { > + __u64 base_address; > + __u64 size; > 
+ __u64 flags; > + __u64 padding[5]; > +}; > + > #endif /* __LINUX_KVM_H */ > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > index 754c6c923427..1b94126622e8 100644 > --- a/virt/kvm/Kconfig > +++ b/virt/kvm/Kconfig > @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS > config KVM_GENERIC_DIRTYLOG_READ_PROTECT > bool > > +config KVM_GENERIC_MAP_MEMORY > + bool > + > config KVM_COMPAT > def_bool y > depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 38b498669ef9..350ead98e9a6 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -4379,6 +4379,47 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) > return fd; > } > > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY > +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu, > + struct kvm_map_memory *mapping) > +{ > + int idx, r; > + u64 full_size; > + > + if (mapping->flags) > + return -EINVAL; > + > + if (!PAGE_ALIGNED(mapping->base_address) || > + !PAGE_ALIGNED(mapping->size) || > + mapping->base_address + mapping->size <= mapping->base_address) > + return -EINVAL; > + > + vcpu_load(vcpu); > + idx = srcu_read_lock(&vcpu->kvm->srcu); > + > + r = 0; > + full_size = mapping->size; > + while (mapping->size) { > + if (signal_pending(current)) { > + r = -EINTR; > + break; > + } > + > + r = kvm_arch_vcpu_map_memory(vcpu, mapping); > + if (r) > + break; > + > + cond_resched(); > + } > + > + srcu_read_unlock(&vcpu->kvm->srcu, idx); > + vcpu_put(vcpu); > + > + /* Return success if at least one page was mapped successfully. */ > + return full_size == mapping->size ? 
r : 0; > +} > +#endif > + > static long kvm_vcpu_ioctl(struct file *filp, > unsigned int ioctl, unsigned long arg) > { > @@ -4580,6 +4621,20 @@ static long kvm_vcpu_ioctl(struct file *filp, > r = kvm_vcpu_ioctl_get_stats_fd(vcpu); > break; > } > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY > + case KVM_MAP_MEMORY: { > + struct kvm_map_memory mapping; > + > + r = -EFAULT; > + if (copy_from_user(&mapping, argp, sizeof(mapping))) > + break; > + r = kvm_vcpu_map_memory(vcpu, &mapping); > + /* Pass back leftover range. */ > + if (copy_to_user(argp, &mapping, sizeof(mapping))) > + r = -EFAULT; > + break; > + } > +#endif > default: > r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); > } > @@ -4863,6 +4918,12 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) > #ifdef CONFIG_KVM_PRIVATE_MEM > case KVM_CAP_GUEST_MEMFD: > return !kvm || kvm_arch_has_private_mem(kvm); > +#endif > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY > + case KVM_CAP_MAP_MEMORY: > + if (!kvm) > + return 1; > + /* Leave per-VM implementation to kvm_vm_ioctl_check_extension(). */ nitpick: fallthrough; > #endif > default: > break; > -- > 2.43.0 > > >
On Wed, Apr 17, 2024, Isaku Yamahata wrote: > > + vcpu_load(vcpu); > > + idx = srcu_read_lock(&vcpu->kvm->srcu); > > + > > + r = 0; > > + full_size = mapping->size; > > + while (mapping->size) { Maybe pre-check !mapping->size? E.g. there's no reason to load the vCPU and acquire SRCU just to do nothing. Then this can be a do-while loop and doesn't need to explicitly initialize 'r'. > > + if (signal_pending(current)) { > > + r = -EINTR; > > + break; > > + } > > + > > + r = kvm_arch_vcpu_map_memory(vcpu, mapping); Requiring arch code to address @mapping is cumbersome. If the arch call returns a long, then can't we do? if (r < 0) break; mapping->size -= r; mapping->gpa += r; > > + if (r) > > + break; > > + > > + cond_resched(); > > + } > > + > > + srcu_read_unlock(&vcpu->kvm->srcu, idx); > > + vcpu_put(vcpu); > > + > > + /* Return success if at least one page was mapped successfully. */ > > + return full_size == mapping->size ? r : 0; I just saw Paolo's update that this is intentional, but this strikes me as odd, as it requires userspace to redo the ioctl() to figure out why the last one failed. Not a sticking point, just odd to my eyes.
On Wed, Apr 17, 2024 at 11:07 PM Sean Christopherson <seanjc@google.com> wrote: > > On Wed, Apr 17, 2024, Isaku Yamahata wrote: > > > + vcpu_load(vcpu); > > > + idx = srcu_read_lock(&vcpu->kvm->srcu); > > > + > > > + r = 0; > > > + full_size = mapping->size; > > > + while (mapping->size) { > > Maybe pre-check !mapping->size? E.g. there's no reason to load the vCPU and > acquire SRCU just to do nothing. Then this can be a do-while loop and doesn't > need to explicitly initialize 'r'. It's unlikely to make any difference but okay---easy enough. > > > + if (signal_pending(current)) { > > > + r = -EINTR; > > > + break; > > > + } > > > + > > > + r = kvm_arch_vcpu_map_memory(vcpu, mapping); > > Requiring arch code to address @mapping is cumbersone. If the arch call returns > a long, then can't we do? > > if (r < 0) > break; > > mapping->size -= r; > mapping->gpa += r; Ok, I thought the same for the return value. I didn't expand the arguments to arch code in case in the future we have flags or other expansions of the struct. > > > + if (r) > > > + break; > > > + > > > + cond_resched(); > > > + } > > > + > > > + srcu_read_unlock(&vcpu->kvm->srcu, idx); > > > + vcpu_put(vcpu); > > > + > > > + /* Return success if at least one page was mapped successfully. */ > > > + return full_size == mapping->size ? r : 0; > > I just saw Paolo's update that this is intentional, but this strikes me as odd, > as it requires userspace to redo the ioctl() to figure out why the last one failed. Yeah, the same is true of read() but I don't think it's a problem. If we get an EINTR, then (unlike KVM_RUN which can change the signal mask) the signal will be delivered right after the ioctl() returns and you can just proceed. For EAGAIN you can just proceed in general. And of course, if RET_PF_RETRY is handled in the kernel then EAGAIN goes away and the only cause of premature exit can be EINTR. Paolo > Not a sticking point, just odd to my eyes. >
> > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY > > + case KVM_CAP_MAP_MEMORY: > > + if (!kvm) > > + return 1; > > + /* Leave per-VM implementation to kvm_vm_ioctl_check_extension(). */ > > nitpick: > fallthough; A little tricky. 'break;' should be more straightforward. Thanks, Yilun > > > #endif > > default: > > break; > > -- > > 2.43.0 > > > > > > > > -- > Isaku Yamahata <isaku.yamahata@intel.com> >
On Wed, Apr 17, 2024 at 11:34:45AM -0400, Paolo Bonzini wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > Add a new ioctl KVM_MAP_MEMORY in the KVM common code. It iterates on the > memory range and calls the arch-specific function. Add stub arch function > as a weak symbol. The weak symbol is gone. Thanks, Yilun
On Fri, Apr 19, 2024, Xu Yilun wrote: > > > +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY > > > + case KVM_CAP_MAP_MEMORY: > > > + if (!kvm) > > > + return 1; > > > + /* Leave per-VM implementation to kvm_vm_ioctl_check_extension(). */ > > > > nitpick: > > fallthough; > > A little tricky. 'break;' should be more straightforward. +1, though it's a moot point as Paolo dropped this code in v4.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8dea11701ab2..2b0f0240a64c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY +int kvm_arch_vcpu_map_memory(struct kvm_vcpu *vcpu, + struct kvm_map_memory *mapping); +#endif + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2190adbe3002..4d233f44c613 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -917,6 +917,7 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_MAP_MEMORY 236 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_MAP_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_map_memory) + +struct kvm_map_memory { + __u64 base_address; + __u64 size; + __u64 flags; + __u64 padding[5]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 754c6c923427..1b94126622e8 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool +config KVM_GENERIC_MAP_MEMORY + bool + config KVM_COMPAT def_bool y depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 38b498669ef9..350ead98e9a6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4379,6 +4379,47 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) return fd; } +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY +static int kvm_vcpu_map_memory(struct kvm_vcpu *vcpu, + struct kvm_map_memory *mapping) +{ + int idx, r; + u64 full_size; + + if (mapping->flags) + return -EINVAL; + + if (!PAGE_ALIGNED(mapping->base_address) || + 
!PAGE_ALIGNED(mapping->size) || + mapping->base_address + mapping->size <= mapping->base_address) + return -EINVAL; + + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + r = 0; + full_size = mapping->size; + while (mapping->size) { + if (signal_pending(current)) { + r = -EINTR; + break; + } + + r = kvm_arch_vcpu_map_memory(vcpu, mapping); + if (r) + break; + + cond_resched(); + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu_put(vcpu); + + /* Return success if at least one page was mapped successfully. */ + return full_size == mapping->size ? r : 0; +} +#endif + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4580,6 +4621,20 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY + case KVM_MAP_MEMORY: { + struct kvm_map_memory mapping; + + r = -EFAULT; + if (copy_from_user(&mapping, argp, sizeof(mapping))) + break; + r = kvm_vcpu_map_memory(vcpu, &mapping); + /* Pass back leftover range. */ + if (copy_to_user(argp, &mapping, sizeof(mapping))) + r = -EFAULT; + break; + } +#endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } @@ -4863,6 +4918,12 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_PRIVATE_MEM case KVM_CAP_GUEST_MEMFD: return !kvm || kvm_arch_has_private_mem(kvm); +#endif +#ifdef CONFIG_KVM_GENERIC_MAP_MEMORY + case KVM_CAP_MAP_MEMORY: + if (!kvm) + return 1; + /* Leave per-VM implementation to kvm_vm_ioctl_check_extension(). */ #endif default: break;