Message ID | 20240419085927.3648704-3-pbonzini@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | KVM: Guest Memory Pre-Population API | expand |
On 4/19/2024 4:59 PM, Paolo Bonzini wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the > memory range and calls the arch-specific function. Add stub arch function > as a weak symbol. > > Suggested-by: Sean Christopherson <seanjc@google.com> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com> > Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- > include/linux/kvm_host.h | 5 ++++ > include/uapi/linux/kvm.h | 10 +++++++ > virt/kvm/Kconfig | 3 ++ > virt/kvm/kvm_main.c | 63 ++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 81 insertions(+) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 8dea11701ab2..9e9943e5e37c 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages > void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); > #endif > > +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY > +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, > + struct kvm_pre_fault_memory *range); > +#endif > + > #endif > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 2190adbe3002..917d2964947d 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -917,6 +917,7 @@ struct kvm_enable_cap { > #define KVM_CAP_MEMORY_ATTRIBUTES 233 > #define KVM_CAP_GUEST_MEMFD 234 > #define KVM_CAP_VM_TYPES 235 > +#define KVM_CAP_PRE_FAULT_MEMORY 236 > > struct kvm_irq_routing_irqchip { > __u32 irqchip; > @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { > __u64 reserved[6]; > }; > > +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) > + > +struct kvm_pre_fault_memory { > + __u64 gpa; > + __u64 size; > + 
__u64 flags; > + __u64 padding[5]; > +}; > + > #endif /* __LINUX_KVM_H */ > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > index 754c6c923427..b14e14cdbfb9 100644 > --- a/virt/kvm/Kconfig > +++ b/virt/kvm/Kconfig > @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS > config KVM_GENERIC_DIRTYLOG_READ_PROTECT > bool > > +config KVM_GENERIC_PRE_FAULT_MEMORY > + bool > + > config KVM_COMPAT > def_bool y > depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 38b498669ef9..51d8dbe7e93b 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -4379,6 +4379,55 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) > return fd; > } > > +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY > +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, > + struct kvm_pre_fault_memory *range) > +{ > + int idx; > + long r; > + u64 full_size; > + > + if (range->flags) > + return -EINVAL; > + > + if (!PAGE_ALIGNED(range->gpa) || > + !PAGE_ALIGNED(range->size) || > + range->gpa + range->size <= range->gpa) > + return -EINVAL; > + > + if (!range->size) > + return 0; The range->size == 0 case is already covered by "range->gpa + range->size <= range->gpa". If we want to return success when size is 0 (though I am not sure that's needed), we need to use "range->gpa + range->size < range->gpa" instead. > + > + vcpu_load(vcpu); > + idx = srcu_read_lock(&vcpu->kvm->srcu); > + > + full_size = range->size; > + do { > + if (signal_pending(current)) { > + r = -EINTR; > + break; > + } > + > + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); > + if (r < 0) > + break; > + > + if (WARN_ON_ONCE(r == 0)) > + break; > + > + range->size -= r; > + range->gpa += r; > + cond_resched(); > + } while (range->size); > + > + srcu_read_unlock(&vcpu->kvm->srcu, idx); > + vcpu_put(vcpu); > + > + /* Return success if at least one page was mapped successfully. */ > + return full_size == range->size ? 
r : 0; > +} > +#endif > + > static long kvm_vcpu_ioctl(struct file *filp, > unsigned int ioctl, unsigned long arg) > { > @@ -4580,6 +4629,20 @@ static long kvm_vcpu_ioctl(struct file *filp, > r = kvm_vcpu_ioctl_get_stats_fd(vcpu); > break; > } > +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY > + case KVM_PRE_FAULT_MEMORY: { > + struct kvm_pre_fault_memory range; > + > + r = -EFAULT; > + if (copy_from_user(&range, argp, sizeof(range))) > + break; > + r = kvm_vcpu_pre_fault_memory(vcpu, &range); > + /* Pass back leftover range. */ > + if (copy_to_user(argp, &range, sizeof(range))) > + r = -EFAULT; > + break; > + } > +#endif > default: > r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); > }
On 4/19/2024 4:59 PM, Paolo Bonzini wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the > memory range and calls the arch-specific function. Add stub arch function > as a weak symbol. The description is stale: the weak symbol has been removed since v3.
On Fri, Apr 19, 2024 at 04:59:23AM -0400, Paolo Bonzini <pbonzini@redhat.com> wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > Add a new ioctl KVM_PRE_FAULT_MEMORY in the KVM common code. It iterates on the > memory range and calls the arch-specific function. Add stub arch function > as a weak symbol. > > Suggested-by: Sean Christopherson <seanjc@google.com> > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com> > Message-ID: <819322b8f25971f2b9933bfa4506e618508ad782.1712785629.git.isaku.yamahata@intel.com> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> > --- > include/linux/kvm_host.h | 5 ++++ > include/uapi/linux/kvm.h | 10 +++++++ > virt/kvm/Kconfig | 3 ++ > virt/kvm/kvm_main.c | 63 ++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 81 insertions(+) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 8dea11701ab2..9e9943e5e37c 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages > void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); > #endif > > +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY > +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, > + struct kvm_pre_fault_memory *range); > +#endif > + > #endif > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index 2190adbe3002..917d2964947d 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -917,6 +917,7 @@ struct kvm_enable_cap { > #define KVM_CAP_MEMORY_ATTRIBUTES 233 > #define KVM_CAP_GUEST_MEMFD 234 > #define KVM_CAP_VM_TYPES 235 > +#define KVM_CAP_PRE_FAULT_MEMORY 236 > > struct kvm_irq_routing_irqchip { > __u32 irqchip; > @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { > __u64 reserved[6]; > }; > > +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) > + > +struct 
kvm_pre_fault_memory { > + __u64 gpa; > + __u64 size; > + __u64 flags; > + __u64 padding[5]; > +}; > + > #endif /* __LINUX_KVM_H */ > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > index 754c6c923427..b14e14cdbfb9 100644 > --- a/virt/kvm/Kconfig > +++ b/virt/kvm/Kconfig > @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS > config KVM_GENERIC_DIRTYLOG_READ_PROTECT > bool > > +config KVM_GENERIC_PRE_FAULT_MEMORY > + bool > + > config KVM_COMPAT > def_bool y > depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 38b498669ef9..51d8dbe7e93b 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -4379,6 +4379,55 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) > return fd; > } > > +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY > +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, > + struct kvm_pre_fault_memory *range) > +{ > + int idx; > + long r; > + u64 full_size; > + > + if (range->flags) > + return -EINVAL; To preserve future extensibility, check that the padding is zero. Or will we rely on flags? if (!memchr_inv(range->padding, 0, sizeof(range->padding))) return -EINVAL;
On Mon, Apr 22, 2024 at 7:39 AM Binbin Wu <binbin.wu@linux.intel.com> wrote: > range->size equals 0 can be covered by "range->gpa + range->size <= > range->gpa" > > If we want to return success when size is 0 (, though I am not sure it's > needed), > we need to use "range->gpa + range->size < range->gpa" instead. I think it's not needed because it could cause an infinite loop in (buggy) userspace. Better return -EINVAL. Paolo > > > + > > + vcpu_load(vcpu); > > + idx = srcu_read_lock(&vcpu->kvm->srcu); > > + > > + full_size = range->size; > > + do { > > + if (signal_pending(current)) { > > + r = -EINTR; > > + break; > > + } > > + > > + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); > > + if (r < 0) > > + break; > > + > > + if (WARN_ON_ONCE(r == 0)) > > + break; > > + > > + range->size -= r; > > + range->gpa += r; > > + cond_resched(); > > + } while (range->size); > > + > > + srcu_read_unlock(&vcpu->kvm->srcu, idx); > > + vcpu_put(vcpu); > > + > > + /* Return success if at least one page was mapped successfully. */ > > + return full_size == range->size ? r : 0; > > +} > > +#endif > > + > > static long kvm_vcpu_ioctl(struct file *filp, > > unsigned int ioctl, unsigned long arg) > > { > > @@ -4580,6 +4629,20 @@ static long kvm_vcpu_ioctl(struct file *filp, > > r = kvm_vcpu_ioctl_get_stats_fd(vcpu); > > break; > > } > > +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY > > + case KVM_PRE_FAULT_MEMORY: { > > + struct kvm_pre_fault_memory range; > > + > > + r = -EFAULT; > > + if (copy_from_user(&range, argp, sizeof(range))) > > + break; > > + r = kvm_vcpu_pre_fault_memory(vcpu, &range); > > + /* Pass back leftover range. */ > > + if (copy_to_user(argp, &range, sizeof(range))) > > + r = -EFAULT; > > + break; > > + } > > +#endif > > default: > > r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); > > } >
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8dea11701ab2..9e9943e5e37c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2478,4 +2478,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); #endif +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range); +#endif + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2190adbe3002..917d2964947d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -917,6 +917,7 @@ struct kvm_enable_cap { #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_PRE_FAULT_MEMORY 236 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1548,4 +1549,13 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_PRE_FAULT_MEMORY _IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory) + +struct kvm_pre_fault_memory { + __u64 gpa; + __u64 size; + __u64 flags; + __u64 padding[5]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 754c6c923427..b14e14cdbfb9 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -67,6 +67,9 @@ config HAVE_KVM_INVALID_WAKEUPS config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool +config KVM_GENERIC_PRE_FAULT_MEMORY + bool + config KVM_COMPAT def_bool y depends on KVM && COMPAT && !(S390 || ARM64 || RISCV) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 38b498669ef9..51d8dbe7e93b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4379,6 +4379,55 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) return fd; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY +static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + int idx; + long r; + u64 full_size; + + if (range->flags) + return 
-EINVAL; + + if (!PAGE_ALIGNED(range->gpa) || + !PAGE_ALIGNED(range->size) || + range->gpa + range->size <= range->gpa) + return -EINVAL; + + if (!range->size) + return 0; + + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + + full_size = range->size; + do { + if (signal_pending(current)) { + r = -EINTR; + break; + } + + r = kvm_arch_vcpu_pre_fault_memory(vcpu, range); + if (r < 0) + break; + + if (WARN_ON_ONCE(r == 0)) + break; + + range->size -= r; + range->gpa += r; + cond_resched(); + } while (range->size); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu_put(vcpu); + + /* Return success if at least one page was mapped successfully. */ + return full_size == range->size ? r : 0; +} +#endif + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4580,6 +4629,20 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_get_stats_fd(vcpu); break; } +#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY + case KVM_PRE_FAULT_MEMORY: { + struct kvm_pre_fault_memory range; + + r = -EFAULT; + if (copy_from_user(&range, argp, sizeof(range))) + break; + r = kvm_vcpu_pre_fault_memory(vcpu, &range); + /* Pass back leftover range. */ + if (copy_to_user(argp, &range, sizeof(range))) + r = -EFAULT; + break; + } +#endif default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); }