Message ID | 20190204181546.12095.81356.stgit@localhost.localdomain (mailing list archive) |
---|---|
State | New, archived |
Series | kvm: Report unused guest pages to host |
On Mon, Feb 04, 2019 at 10:15:46AM -0800, Alexander Duyck wrote:
> From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>
> Add the host side of the KVM memory hinting support. With this we expose a feature bit indicating that the host will pass the messages along to the new madvise function.
>
> This functionality is mutually exclusive with device assignment. If a device is assigned we will disable the functionality as it could lead to a potential memory corruption if a device writes to a page after KVM has flagged it as not being used.

I really dislike this kind of tie-in.

Yes right now assignment is not smart enough but generally you can protect the unused page in the IOMMU and that's it, it's safe.

So the policy should not leak into host/guest interface. Instead it is better to just keep the pages pinned and ignore the hint for now.

> The logic as it is currently defined limits the hint to only supporting a hugepage or larger notifications. This is meant to help prevent us from potentially breaking up huge pages by hinting that only a portion of the page is not needed.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
>  Documentation/virtual/kvm/cpuid.txt      |    4 +++
>  Documentation/virtual/kvm/hypercalls.txt |   14 ++++++++++++
>  arch/x86/include/uapi/asm/kvm_para.h     |    3 +++
>  arch/x86/kvm/cpuid.c                     |    6 ++++-
>  arch/x86/kvm/x86.c                       |   35 ++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm_para.h            |    1 +
>  6 files changed, 62 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
> index 97ca1940a0dc..fe3395a58b7e 100644
> --- a/Documentation/virtual/kvm/cpuid.txt
> +++ b/Documentation/virtual/kvm/cpuid.txt
> @@ -66,6 +66,10 @@ KVM_FEATURE_PV_SEND_IPI || 11 || guest checks this feature bit
>                                      ||       || before using paravirtualized
>                                      ||       || send IPIs.
>  ------------------------------------------------------------------------------
> +KVM_FEATURE_PV_UNUSED_PAGE_HINT     || 12    || guest checks this feature bit
> +                                    ||       || before using paravirtualized
> +                                    ||       || unused page hints.
> +------------------------------------------------------------------------------
>  KVM_FEATURE_CLOCKSOURCE_STABLE_BIT  || 24    || host will warn if no guest-side
>                                      ||       || per-cpu warps are expected in
>                                      ||       || kvmclock.
> diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
> index da24c138c8d1..b374678ac1f9 100644
> --- a/Documentation/virtual/kvm/hypercalls.txt
> +++ b/Documentation/virtual/kvm/hypercalls.txt
> @@ -141,3 +141,17 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1
>  corresponds to the APIC ID a2+1, and so on.
>
>  Returns the number of CPUs to which the IPIs were delivered successfully.
> +
> +7. KVM_HC_UNUSED_PAGE_HINT
> +------------------------
> +Architecture: x86
> +Status: active
> +Purpose: Send unused page hint to host
> +
> +a0: physical address of region unused, page aligned
> +a1: size of unused region, page aligned
> +
> +The hypercall lets a guest send notifications to the host that it will no
> +longer be using a given page in memory. Multiple pages can be hinted at by
> +using the size field to hint that a higher order page is available by
> +specifying the higher order page size.
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
> index 19980ec1a316..f066c23060df 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -29,6 +29,7 @@
>  #define KVM_FEATURE_PV_TLB_FLUSH        9
>  #define KVM_FEATURE_ASYNC_PF_VMEXIT     10
>  #define KVM_FEATURE_PV_SEND_IPI         11
> +#define KVM_FEATURE_PV_UNUSED_PAGE_HINT 12
>
>  #define KVM_HINTS_REALTIME              0
>
> @@ -119,4 +120,6 @@ struct kvm_vcpu_pv_apf_data {
>  #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
>  #define KVM_PV_EOI_DISABLED 0x0
>
> +#define KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER HUGETLB_PAGE_ORDER
> +
>  #endif /* _UAPI_ASM_X86_KVM_PARA_H */
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index bbffa6c54697..b82bcbfbc420 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -136,6 +136,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
>  	if (kvm_hlt_in_guest(vcpu->kvm) && best &&
>  		(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
>  		best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
> +	if (kvm_arch_has_assigned_device(vcpu->kvm) && best &&
> +		(best->eax & KVM_FEATURE_PV_UNUSED_PAGE_HINT))
> +		best->eax &= ~(1 << KVM_FEATURE_PV_UNUSED_PAGE_HINT);
>
>  	/* Update physical-address width */
>  	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
> @@ -637,7 +640,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>  			     (1 << KVM_FEATURE_PV_UNHALT) |
>  			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
>  			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
> -			     (1 << KVM_FEATURE_PV_SEND_IPI);
> +			     (1 << KVM_FEATURE_PV_SEND_IPI) |
> +			     (1 << KVM_FEATURE_PV_UNUSED_PAGE_HINT);
>
>  		if (sched_info_on())
>  			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 3d27206f6c01..3ec75ab849e2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -55,6 +55,7 @@
>  #include <linux/irqbypass.h>
>  #include <linux/sched/stat.h>
>  #include <linux/mem_encrypt.h>
> +#include <linux/mm.h>
>
>  #include <trace/events/kvm.h>
>
> @@ -7052,6 +7053,37 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
>  	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
>  }
>
> +static int kvm_pv_unused_page_hint_op(struct kvm *kvm, gpa_t gpa, size_t len)
> +{
> +	unsigned long start;
> +
> +	/*
> +	 * Guarantee the following:
> +	 *	len meets minimum size
> +	 *	len is a power of 2
> +	 *	gpa is aligned to len
> +	 */
> +	if (len < (PAGE_SIZE << KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER))
> +		return -KVM_EINVAL;
> +	if (!is_power_of_2(len) || !IS_ALIGNED(gpa, len))
> +		return -KVM_EINVAL;
> +
> +	/*
> +	 * If a device is assigned we cannot use use madvise as memory
> +	 * is shared with the device and could lead to memory corruption
> +	 * if the device writes to it after free.
> +	 */
> +	if (kvm_arch_has_assigned_device(kvm))
> +		return -KVM_EOPNOTSUPP;
> +
> +	start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
> +
> +	if (kvm_is_error_hva(start + len))
> +		return -KVM_EFAULT;
> +
> +	return do_madvise_dontneed(start, len);
> +}
> +
>  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>  {
>  	unsigned long nr, a0, a1, a2, a3, ret;
> @@ -7098,6 +7130,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
>  	case KVM_HC_SEND_IPI:
>  		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
>  		break;
> +	case KVM_HC_UNUSED_PAGE_HINT:
> +		ret = kvm_pv_unused_page_hint_op(vcpu->kvm, a0, a1);
> +		break;
>  	default:
>  		ret = -KVM_ENOSYS;
>  		break;
> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
> index 6c0ce49931e5..75643b862a4e 100644
> --- a/include/uapi/linux/kvm_para.h
> +++ b/include/uapi/linux/kvm_para.h
> @@ -28,6 +28,7 @@
>  #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
>  #define KVM_HC_CLOCK_PAIRING		9
>  #define KVM_HC_SEND_IPI			10
> +#define KVM_HC_UNUSED_PAGE_HINT		11
>
>  /*
>  * hypercalls use architecture specific
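[Editor's note] One way to read Michael's suggestion against the handler above is to keep the CPUID bit exposed and make the hypercall a no-op while a device is assigned, rather than hiding the feature and returning an error. The following is only a sketch of that idea, reusing the helper names from the patch (do_madvise_dontneed() is introduced elsewhere in this series), not a replacement hunk:

static int kvm_pv_unused_page_hint_op(struct kvm *kvm, gpa_t gpa, size_t len)
{
	unsigned long start;

	/* Same sanity checks as the patch: minimum order, power of 2, aligned. */
	if (len < (PAGE_SIZE << KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER))
		return -KVM_EINVAL;
	if (!is_power_of_2(len) || !IS_ALIGNED(gpa, len))
		return -KVM_EINVAL;

	/*
	 * Device assigned: the pages stay pinned for DMA, so simply ignore
	 * the hint instead of failing or clearing the feature bit.
	 */
	if (kvm_arch_has_assigned_device(kvm))
		return 0;

	start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
	if (kvm_is_error_hva(start + len))
		return -KVM_EFAULT;

	return do_madvise_dontneed(start, len);
}

Returning 0 keeps the guest-visible behaviour identical whether or not assignment is active; the only difference is whether the backing host memory is actually dropped.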
On Sat, 2019-02-09 at 19:44 -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 04, 2019 at 10:15:46AM -0800, Alexander Duyck wrote:
> > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> >
> > Add the host side of the KVM memory hinting support. With this we expose a feature bit indicating that the host will pass the messages along to the new madvise function.
> >
> > This functionality is mutually exclusive with device assignment. If a device is assigned we will disable the functionality as it could lead to a potential memory corruption if a device writes to a page after KVM has flagged it as not being used.
>
> I really dislike this kind of tie-in.
>
> Yes right now assignment is not smart enough but generally you can protect the unused page in the IOMMU and that's it, it's safe.
>
> So the policy should not leak into host/guest interface. Instead it is better to just keep the pages pinned and ignore the hint for now.

Okay, I can do that. It also gives me a means of benchmarking just the hypercall cost versus the extra page faults and zeroing.
On Mon, Feb 11, 2019 at 09:34:25AM -0800, Alexander Duyck wrote:
> On Sat, 2019-02-09 at 19:44 -0500, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 10:15:46AM -0800, Alexander Duyck wrote:
> > > From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > >
> > > Add the host side of the KVM memory hinting support. With this we expose a feature bit indicating that the host will pass the messages along to the new madvise function.
> > >
> > > This functionality is mutually exclusive with device assignment. If a device is assigned we will disable the functionality as it could lead to a potential memory corruption if a device writes to a page after KVM has flagged it as not being used.
> >
> > I really dislike this kind of tie-in.
> >
> > Yes right now assignment is not smart enough but generally you can protect the unused page in the IOMMU and that's it, it's safe.
> >
> > So the policy should not leak into host/guest interface. Instead it is better to just keep the pages pinned and ignore the hint for now.
>
> Okay, I can do that. It also gives me a means of benchmarking just the hypercall cost versus the extra page faults and zeroing.

Good point. Same goes for poisoning :)
On 2/9/19 4:44 PM, Michael S. Tsirkin wrote:
> So the policy should not leak into host/guest interface. Instead it is better to just keep the pages pinned and ignore the hint for now.

It does seems a bit silly to have guests forever hinting about freed memory when the host never has a hope of doing anything about it.

Is that part fixable?
On Mon, Feb 11, 2019 at 09:41:19AM -0800, Dave Hansen wrote:
> On 2/9/19 4:44 PM, Michael S. Tsirkin wrote:
> > So the policy should not leak into host/guest interface. Instead it is better to just keep the pages pinned and ignore the hint for now.
>
> It does seems a bit silly to have guests forever hinting about freed memory when the host never has a hope of doing anything about it.
>
> Is that part fixable?

Yes just not with existing IOMMU APIs.

It's in the paragraph just above that you cut out:

	Yes right now assignment is not smart enough but generally you can protect the unused page in the IOMMU and that's it, it's safe.

So e.g.

extern int iommu_remap(struct iommu_domain *domain, unsigned long iova,
		       phys_addr_t paddr, size_t size, int prot);

I can elaborate if you like but generally we would need an API that allows you to atomically update a mapping for a specific page without perturbing the mapping for other pages.
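[Editor's note] To make the proposal concrete: assuming something like the proposed iommu_remap() existed, the host-side hint path could retarget just the hinted IOVA range at a harmless backing page before dropping the real memory. Everything below is hypothetical — iommu_remap() is not an existing kernel API and hint_protect_range() is only an illustrative name:

/*
 * Hypothetical: iommu_remap() is only the API proposed above, it does not
 * exist today.  The idea is to atomically point the hinted IOVA range at a
 * read-only dummy page so an assigned device can no longer dirty memory the
 * guest has declared unused, without disturbing neighbouring mappings.
 */
static int hint_protect_range(struct iommu_domain *domain, unsigned long iova,
			      size_t size, phys_addr_t dummy_page)
{
	int ret;

	ret = iommu_remap(domain, iova, dummy_page, size, IOMMU_READ);
	if (ret)
		return ret;

	/* Only after the remap succeeds is it safe to madvise the host memory away. */
	return 0;
}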
On Mon, 2019-02-11 at 12:48 -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 11, 2019 at 09:41:19AM -0800, Dave Hansen wrote:
> > On 2/9/19 4:44 PM, Michael S. Tsirkin wrote:
> > > So the policy should not leak into host/guest interface. Instead it is better to just keep the pages pinned and ignore the hint for now.
> >
> > It does seems a bit silly to have guests forever hinting about freed memory when the host never has a hope of doing anything about it.
> >
> > Is that part fixable?
>
> Yes just not with existing IOMMU APIs.
>
> It's in the paragraph just above that you cut out:
> 	Yes right now assignment is not smart enough but generally you can protect the unused page in the IOMMU and that's it, it's safe.
>
> So e.g.
> extern int iommu_remap(struct iommu_domain *domain, unsigned long iova,
> 		       phys_addr_t paddr, size_t size, int prot);
>
> I can elaborate if you like but generally we would need an API that allows you to atomically update a mapping for a specific page without perturbing the mapping for other pages.

I still don't see how this would solve anything unless you have the guest somehow hinting on what pages it is providing to the devices.

You would have to have the host invalidating the pages when the hint is provided, and have a new hint tied to arch_alloc_page that would rebuild the IOMMU mapping when a page is allocated.

I'm pretty certain that the added cost of that would make the hinting pretty pointless as my experience has been that the IOMMU is too much of a bottleneck to have multiple CPUs trying to create and invalidate mappings simultaneously.
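[Editor's note] The objection is easier to see with a sketch of what that allocation-side hint would have to look like in the guest. Nothing below exists: KVM_HC_USED_PAGE_HINT is an invented hypercall number, and the point being made is precisely that a hook like this would put a hypercall (plus an IOMMU remap on the host) on every page allocation:

/*
 * Hypothetical guest-side hook illustrating the cost being questioned:
 * every allocation would have to tell the host to re-establish the IOMMU
 * mapping before the page can safely be handed to an assigned device.
 * KVM_HC_USED_PAGE_HINT is invented for this sketch.
 */
void arch_alloc_page(struct page *page, int order)
{
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
		return;

	kvm_hypercall2(KVM_HC_USED_PAGE_HINT,
		       page_to_phys(page), PAGE_SIZE << order);
}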
On Mon, Feb 11, 2019 at 10:30:10AM -0800, Alexander Duyck wrote:
> On Mon, 2019-02-11 at 12:48 -0500, Michael S. Tsirkin wrote:
> > On Mon, Feb 11, 2019 at 09:41:19AM -0800, Dave Hansen wrote:
> > > On 2/9/19 4:44 PM, Michael S. Tsirkin wrote:
> > > > So the policy should not leak into host/guest interface. Instead it is better to just keep the pages pinned and ignore the hint for now.
> > >
> > > It does seems a bit silly to have guests forever hinting about freed memory when the host never has a hope of doing anything about it.
> > >
> > > Is that part fixable?
> >
> > Yes just not with existing IOMMU APIs.
> >
> > It's in the paragraph just above that you cut out:
> > 	Yes right now assignment is not smart enough but generally you can protect the unused page in the IOMMU and that's it, it's safe.
> >
> > So e.g.
> > extern int iommu_remap(struct iommu_domain *domain, unsigned long iova,
> > 		       phys_addr_t paddr, size_t size, int prot);
> >
> > I can elaborate if you like but generally we would need an API that allows you to atomically update a mapping for a specific page without perturbing the mapping for other pages.
>
> I still don't see how this would solve anything unless you have the guest somehow hinting on what pages it is providing to the devices.
>
> You would have to have the host invalidating the pages when the hint is provided, and have a new hint tied to arch_alloc_page that would rebuild the IOMMU mapping when a page is allocated.
>
> I'm pretty certain that the added cost of that would make the hinting pretty pointless as my experience has been that the IOMMU is too much of a bottleneck to have multiple CPUs trying to create and invalidate mappings simultaneously.

I agree it's a concern. Another option would involve passing these hints in the DMA API.

How about the option of removing the device by hotplug when host needs overcommit? That would involve either buffering on host, or requesting free pages after device is removed along the lines of existing balloon code. That btw seems to be an argument for making this hinting part of balloon.
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 97ca1940a0dc..fe3395a58b7e 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -66,6 +66,10 @@ KVM_FEATURE_PV_SEND_IPI || 11 || guest checks this feature bit
                                     ||       || before using paravirtualized
                                     ||       || send IPIs.
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_UNUSED_PAGE_HINT     || 12    || guest checks this feature bit
+                                    ||       || before using paravirtualized
+                                    ||       || unused page hints.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT  || 24    || host will warn if no guest-side
                                     ||       || per-cpu warps are expected in
                                     ||       || kvmclock.
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index da24c138c8d1..b374678ac1f9 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -141,3 +141,17 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1
 corresponds to the APIC ID a2+1, and so on.

 Returns the number of CPUs to which the IPIs were delivered successfully.
+
+7. KVM_HC_UNUSED_PAGE_HINT
+------------------------
+Architecture: x86
+Status: active
+Purpose: Send unused page hint to host
+
+a0: physical address of region unused, page aligned
+a1: size of unused region, page aligned
+
+The hypercall lets a guest send notifications to the host that it will no
+longer be using a given page in memory. Multiple pages can be hinted at by
+using the size field to hint that a higher order page is available by
+specifying the higher order page size.
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 19980ec1a316..f066c23060df 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -29,6 +29,7 @@
 #define KVM_FEATURE_PV_TLB_FLUSH        9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT     10
 #define KVM_FEATURE_PV_SEND_IPI         11
+#define KVM_FEATURE_PV_UNUSED_PAGE_HINT 12

 #define KVM_HINTS_REALTIME              0

@@ -119,4 +120,6 @@ struct kvm_vcpu_pv_apf_data {
 #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
 #define KVM_PV_EOI_DISABLED 0x0

+#define KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER HUGETLB_PAGE_ORDER
+
 #endif /* _UAPI_ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbffa6c54697..b82bcbfbc420 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -136,6 +136,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 	if (kvm_hlt_in_guest(vcpu->kvm) && best &&
 		(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
 		best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+	if (kvm_arch_has_assigned_device(vcpu->kvm) && best &&
+		(best->eax & KVM_FEATURE_PV_UNUSED_PAGE_HINT))
+		best->eax &= ~(1 << KVM_FEATURE_PV_UNUSED_PAGE_HINT);

 	/* Update physical-address width */
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -637,7 +640,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_PV_UNHALT) |
 			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
 			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
-			     (1 << KVM_FEATURE_PV_SEND_IPI);
+			     (1 << KVM_FEATURE_PV_SEND_IPI) |
+			     (1 << KVM_FEATURE_PV_UNUSED_PAGE_HINT);

 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d27206f6c01..3ec75ab849e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -55,6 +55,7 @@
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
 #include <linux/mem_encrypt.h>
+#include <linux/mm.h>

 #include <trace/events/kvm.h>

@@ -7052,6 +7053,37 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
 }

+static int kvm_pv_unused_page_hint_op(struct kvm *kvm, gpa_t gpa, size_t len)
+{
+	unsigned long start;
+
+	/*
+	 * Guarantee the following:
+	 *	len meets minimum size
+	 *	len is a power of 2
+	 *	gpa is aligned to len
+	 */
+	if (len < (PAGE_SIZE << KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER))
+		return -KVM_EINVAL;
+	if (!is_power_of_2(len) || !IS_ALIGNED(gpa, len))
+		return -KVM_EINVAL;
+
+	/*
+	 * If a device is assigned we cannot use use madvise as memory
+	 * is shared with the device and could lead to memory corruption
+	 * if the device writes to it after free.
+	 */
+	if (kvm_arch_has_assigned_device(kvm))
+		return -KVM_EOPNOTSUPP;
+
+	start = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+
+	if (kvm_is_error_hva(start + len))
+		return -KVM_EFAULT;
+
+	return do_madvise_dontneed(start, len);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
@@ -7098,6 +7130,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_SEND_IPI:
 		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
 		break;
+	case KVM_HC_UNUSED_PAGE_HINT:
+		ret = kvm_pv_unused_page_hint_op(vcpu->kvm, a0, a1);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 6c0ce49931e5..75643b862a4e 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -28,6 +28,7 @@
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
 #define KVM_HC_SEND_IPI			10
+#define KVM_HC_UNUSED_PAGE_HINT		11

 /*
 * hypercalls use architecture specific
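[Editor's note] The guest side of the series is not shown on this page. Purely as a sketch of the ABI added above, a guest caller would check the feature bit and pass the guest-physical address in a0 and the length in a1, both satisfying the alignment checks in kvm_pv_unused_page_hint_op(); hint_unused_region() is an illustrative name, not code from the series:

/*
 * Minimal sketch of the documented ABI: report one free higher-order page
 * (at least KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER, i.e. a hugepage) as unused.
 */
static void hint_unused_region(struct page *page, unsigned int order)
{
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNUSED_PAGE_HINT))
		return;
	if (order < KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
		return;

	kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT,
		       page_to_phys(page), PAGE_SIZE << order);
}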