diff mbox series

[1/6] KVM: x86: Add ioctl for accepting a userspace provided MSR list

Message ID 20200804042043.3592620-2-aaronlewis@google.com (mailing list archive)
State New, archived
Headers show
Series Allow userspace to manage MSRs | expand

Commit Message

Aaron Lewis Aug. 4, 2020, 4:20 a.m. UTC
Add KVM_SET_EXIT_MSRS ioctl to allow userspace to pass in a list of MSRs
that force an exit to userspace when rdmsr or wrmsr are used by the
guest.

KVM_SET_EXIT_MSRS will need to be called before any vCPUs are
created to protect the 'user_exit_msrs' list from being mutated while
vCPUs are running.

Add KVM_CAP_SET_MSR_EXITS to identify the feature exists.

Signed-off-by: Aaron Lewis <aaronlewis@google.com>
Reviewed-by: Oliver Upton <oupton@google.com>
---
 Documentation/virt/kvm/api.rst  | 24 +++++++++++++++++++
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c              | 41 +++++++++++++++++++++++++++++++++
 include/uapi/linux/kvm.h        |  2 ++
 4 files changed, 69 insertions(+)

Comments

Alexander Graf Aug. 7, 2020, 4:12 p.m. UTC | #1
On 04.08.20 06:20, Aaron Lewis wrote:
> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> 
> 
> 
> Add KVM_SET_EXIT_MSRS ioctl to allow userspace to pass in a list of MSRs
> that force an exit to userspace when rdmsr or wrmsr are used by the
> guest.
> 
> KVM_SET_EXIT_MSRS will need to be called before any vCPUs are
> created to protect the 'user_exit_msrs' list from being mutated while
> vCPUs are running.
> 
> Add KVM_CAP_SET_MSR_EXITS to identify the feature exists.
> 
> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> Reviewed-by: Oliver Upton <oupton@google.com>
> ---
>   Documentation/virt/kvm/api.rst  | 24 +++++++++++++++++++
>   arch/x86/include/asm/kvm_host.h |  2 ++
>   arch/x86/kvm/x86.c              | 41 +++++++++++++++++++++++++++++++++
>   include/uapi/linux/kvm.h        |  2 ++
>   4 files changed, 69 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 320788f81a05..7d8167c165aa 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -1006,6 +1006,30 @@ such as migration.
>   :Parameters: struct kvm_vcpu_event (out)
>   :Returns: 0 on success, -1 on error
> 
> +4.32 KVM_SET_EXIT_MSRS
> +------------------
> +
> +:Capability: KVM_CAP_SET_MSR_EXITS
> +:Architectures: x86
> +:Type: vm ioctl
> +:Parameters: struct kvm_msr_list (in)
> +:Returns: 0 on success, -1 on error
> +
> +Sets the userspace MSR list which is used to track which MSRs KVM should send
> +to userspace to be serviced when the guest executes rdmsr or wrmsr.

Unfortunately this doesn't solve the whole "ignore_msrs" mess that we 
have today. How can I just say "tell user space about unhandled MSRs"? 
And if I add that on top of this mechanism, how do we not make the list 
of MSRs that are handled in-kernel an ABI?

> +
> +This ioctl needs to be called before vCPUs are setup otherwise the list of MSRs
> +will not be accepted and an EINVAL error will be returned.  Also, if a list of
> +MSRs has already been supplied, and this ioctl is called again an EEXIST error
> +will be returned.
> +
> +::
> +
> +  struct kvm_msr_list {
> +  __u32 nmsrs;
> +  __u32 indices[0];
> +};
> +
>   X86:
>   ^^^^
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index be5363b21540..510055471dd0 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1004,6 +1004,8 @@ struct kvm_arch {
> 
>          struct kvm_pmu_event_filter *pmu_event_filter;
>          struct task_struct *nx_lpage_recovery_thread;
> +
> +       struct kvm_msr_list *user_exit_msrs;
>   };
> 
>   struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 88c593f83b28..46a0fb9e0869 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3419,6 +3419,42 @@ static inline bool kvm_can_mwait_in_guest(void)
>                  boot_cpu_has(X86_FEATURE_ARAT);
>   }
> 
> +static int kvm_vm_ioctl_set_exit_msrs(struct kvm *kvm,
> +                                     struct kvm_msr_list __user *user_msr_list)
> +{
> +       struct kvm_msr_list *msr_list, hdr;
> +       size_t indices_size;
> +
> +       if (kvm->arch.user_exit_msrs != NULL)
> +               return -EEXIST;
> +
> +       if (kvm->created_vcpus)
> +               return -EINVAL;

What if we need to change the set of user space handled MSRs 
dynamically, for example because a feature has been enabled through a 
previous MSR?

> +
> +       if (copy_from_user(&hdr, user_msr_list, sizeof(hdr)))
> +               return -EFAULT;
> +
> +       if (hdr.nmsrs >= MAX_IO_MSRS)

This constant is not defined inside the scope of this patch. Is your 
patchset fully bisectable? Please make sure that every patch builds 
successfully on its own :).


Alex

> +               return -E2BIG;
> +
> +       indices_size = sizeof(hdr.indices[0]) * hdr.nmsrs;
> +       msr_list = kvzalloc(sizeof(struct kvm_msr_list) + indices_size,
> +                           GFP_KERNEL_ACCOUNT);
> +       if (!msr_list)
> +               return -ENOMEM;
> +       msr_list->nmsrs = hdr.nmsrs;
> +
> +       if (copy_from_user(msr_list->indices, user_msr_list->indices,
> +                          indices_size)) {
> +               kvfree(msr_list);
> +               return -EFAULT;
> +       }
> +
> +       kvm->arch.user_exit_msrs = msr_list;
> +
> +       return 0;
> +}
> +
>   int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>   {
>          int r = 0;
> @@ -3476,6 +3512,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>          case KVM_CAP_MSR_PLATFORM_INFO:
>          case KVM_CAP_EXCEPTION_PAYLOAD:
>          case KVM_CAP_SET_GUEST_DEBUG:
> +       case KVM_CAP_SET_MSR_EXITS:
>                  r = 1;
>                  break;
>          case KVM_CAP_SYNC_REGS:
> @@ -5261,6 +5298,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
>                  r = 0;
>                  break;
>          }
> +       case KVM_SET_EXIT_MSRS: {
> +               r = kvm_vm_ioctl_set_exit_msrs(kvm, argp);
> +               break;
> +       }
>          case KVM_MEMORY_ENCRYPT_OP: {
>                  r = -ENOTTY;
>                  if (kvm_x86_ops.mem_enc_op)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 4fdf30316582..de4638c1bd15 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt {
>   #define KVM_CAP_PPC_SECURE_GUEST 181
>   #define KVM_CAP_HALT_POLL 182
>   #define KVM_CAP_ASYNC_PF_INT 183
> +#define KVM_CAP_SET_MSR_EXITS 185
> 
>   #ifdef KVM_CAP_IRQ_ROUTING
> 
> @@ -1371,6 +1372,7 @@ struct kvm_s390_ucas_mapping {
>   /* Available with KVM_CAP_PMU_EVENT_FILTER */
>   #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
>   #define KVM_PPC_SVM_OFF                  _IO(KVMIO,  0xb3)
> +#define KVM_SET_EXIT_MSRS      _IOW(KVMIO, 0xb4, struct kvm_msr_list)
> 
>   /* ioctl for vm fd */
>   #define KVM_CREATE_DEVICE        _IOWR(KVMIO,  0xe0, struct kvm_create_device)
> --
> 2.28.0.163.g6104cc2f0b6-goog
> 



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
Jim Mattson Aug. 7, 2020, 9:16 p.m. UTC | #2
On Fri, Aug 7, 2020 at 9:12 AM Alexander Graf <graf@amazon.com> wrote:
>
>
>
> On 04.08.20 06:20, Aaron Lewis wrote:
> > CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
> >
> >
> >
> > Add KVM_SET_EXIT_MSRS ioctl to allow userspace to pass in a list of MSRs
> > that force an exit to userspace when rdmsr or wrmsr are used by the
> > guest.
> >
> > KVM_SET_EXIT_MSRS will need to be called before any vCPUs are
> > created to protect the 'user_exit_msrs' list from being mutated while
> > vCPUs are running.
> >
> > Add KVM_CAP_SET_MSR_EXITS to identify the feature exists.
> >
> > Signed-off-by: Aaron Lewis <aaronlewis@google.com>
> > Reviewed-by: Oliver Upton <oupton@google.com>
> > ---
> >   Documentation/virt/kvm/api.rst  | 24 +++++++++++++++++++
> >   arch/x86/include/asm/kvm_host.h |  2 ++
> >   arch/x86/kvm/x86.c              | 41 +++++++++++++++++++++++++++++++++
> >   include/uapi/linux/kvm.h        |  2 ++
> >   4 files changed, 69 insertions(+)
> >
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index 320788f81a05..7d8167c165aa 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -1006,6 +1006,30 @@ such as migration.
> >   :Parameters: struct kvm_vcpu_event (out)
> >   :Returns: 0 on success, -1 on error
> >
> > +4.32 KVM_SET_EXIT_MSRS
> > +------------------
> > +
> > +:Capability: KVM_CAP_SET_MSR_EXITS
> > +:Architectures: x86
> > +:Type: vm ioctl
> > +:Parameters: struct kvm_msr_list (in)
> > +:Returns: 0 on success, -1 on error
> > +
> > +Sets the userspace MSR list which is used to track which MSRs KVM should send
> > +to userspace to be serviced when the guest executes rdmsr or wrmsr.
>
> Unfortunately this doesn't solve the whole "ignore_msrs" mess that we
> have today. How can I just say "tell user space about unhandled MSRs"?
> And if I add that on top of this mechanism, how do we not make the list
> of MSRs that are handled in-kernel an ABI?

Jumping in for Aaron, who is out this afternoon...

This patch doesn't attempt to solve your problem, "tell user space
about unhandled MSRs." It attempts to solve our problem, "tell user
space about a specific set of MSRs, even if kvm learns to handle them
in the future." This is, in fact, what we really wanted to do when
Peter Hornyack implemented the "tell user space about unhandled MSRs"
changes in 2015. We just didn't realize it at the time.

Though your proposal does partially solve our problem, it's a lot
easier to specify a small set of MSRs by inclusion than it is by
exclusion. (Where your proposal falls short of our needs is when
userspace wants to handle an MSR that kvm would not typically
intercept at all.)

> > +
> > +This ioctl needs to be called before vCPUs are setup otherwise the list of MSRs
> > +will not be accepted and an EINVAL error will be returned.  Also, if a list of
> > +MSRs has already been supplied, and this ioctl is called again an EEXIST error
> > +will be returned.
> > +
> > +::
> > +
> > +  struct kvm_msr_list {
> > +  __u32 nmsrs;
> > +  __u32 indices[0];
> > +};
> > +
> >   X86:
> >   ^^^^
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index be5363b21540..510055471dd0 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1004,6 +1004,8 @@ struct kvm_arch {
> >
> >          struct kvm_pmu_event_filter *pmu_event_filter;
> >          struct task_struct *nx_lpage_recovery_thread;
> > +
> > +       struct kvm_msr_list *user_exit_msrs;
> >   };
> >
> >   struct kvm_vm_stat {
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 88c593f83b28..46a0fb9e0869 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3419,6 +3419,42 @@ static inline bool kvm_can_mwait_in_guest(void)
> >                  boot_cpu_has(X86_FEATURE_ARAT);
> >   }
> >
> > +static int kvm_vm_ioctl_set_exit_msrs(struct kvm *kvm,
> > +                                     struct kvm_msr_list __user *user_msr_list)
> > +{
> > +       struct kvm_msr_list *msr_list, hdr;
> > +       size_t indices_size;
> > +
> > +       if (kvm->arch.user_exit_msrs != NULL)
> > +               return -EEXIST;
> > +
> > +       if (kvm->created_vcpus)
> > +               return -EINVAL;
>
> What if we need to change the set of user space handled MSRs
> dynamically, for example because a feature has been enabled through a
> previous MSR?

It's never been an issue for us, and this approach avoids the
messiness of having to back out the old changes to the MSR permissions
bitmaps, which is fraught. It can be done, but I would question the
ROI on the additional complexity. In any case, I think a VCPU ioctl
would be more appropriate than a VM ioctl if dynamic modifications
were allowed. For instance, in your example, the previous MSR write
that enabled a feature requiring a new user space MSR exit would
likely apply only to the VCPU on which that previous MSR write
happened.

> > +
> > +       if (copy_from_user(&hdr, user_msr_list, sizeof(hdr)))
> > +               return -EFAULT;
> > +
> > +       if (hdr.nmsrs >= MAX_IO_MSRS)
>
> This constant is not defined inside the scope of this patch. Is your
> patchset fully bisectable? Please make sure that every patch builds
> successfully on its own :).

I can't vouch for this patchset being fully bisectable, but this
particular constant was moved to x86.c ~13 years ago, in commit
313a3dc75da20 ("KVM: Portability: split kvm_vcpu_ioctl"). It should
not cause any build issues. :)
Alexander Graf Aug. 7, 2020, 10:49 p.m. UTC | #3
On 07.08.20 23:16, Jim Mattson wrote:
> 
> On Fri, Aug 7, 2020 at 9:12 AM Alexander Graf <graf@amazon.com> wrote:
>>
>>
>>
>> On 04.08.20 06:20, Aaron Lewis wrote:
>>> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you can confirm the sender and know the content is safe.
>>>
>>>
>>>
>>> Add KVM_SET_EXIT_MSRS ioctl to allow userspace to pass in a list of MSRs
>>> that force an exit to userspace when rdmsr or wrmsr are used by the
>>> guest.
>>>
>>> KVM_SET_EXIT_MSRS will need to be called before any vCPUs are
>>> created to protect the 'user_exit_msrs' list from being mutated while
>>> vCPUs are running.
>>>
>>> Add KVM_CAP_SET_MSR_EXITS to identify the feature exists.
>>>
>>> Signed-off-by: Aaron Lewis <aaronlewis@google.com>
>>> Reviewed-by: Oliver Upton <oupton@google.com>
>>> ---
>>>    Documentation/virt/kvm/api.rst  | 24 +++++++++++++++++++
>>>    arch/x86/include/asm/kvm_host.h |  2 ++
>>>    arch/x86/kvm/x86.c              | 41 +++++++++++++++++++++++++++++++++
>>>    include/uapi/linux/kvm.h        |  2 ++
>>>    4 files changed, 69 insertions(+)
>>>
>>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>>> index 320788f81a05..7d8167c165aa 100644
>>> --- a/Documentation/virt/kvm/api.rst
>>> +++ b/Documentation/virt/kvm/api.rst
>>> @@ -1006,6 +1006,30 @@ such as migration.
>>>    :Parameters: struct kvm_vcpu_event (out)
>>>    :Returns: 0 on success, -1 on error
>>>
>>> +4.32 KVM_SET_EXIT_MSRS
>>> +------------------
>>> +
>>> +:Capability: KVM_CAP_SET_MSR_EXITS
>>> +:Architectures: x86
>>> +:Type: vm ioctl
>>> +:Parameters: struct kvm_msr_list (in)
>>> +:Returns: 0 on success, -1 on error
>>> +
>>> +Sets the userspace MSR list which is used to track which MSRs KVM should send
>>> +to userspace to be serviced when the guest executes rdmsr or wrmsr.
>>
>> Unfortunately this doesn't solve the whole "ignore_msrs" mess that we
>> have today. How can I just say "tell user space about unhandled MSRs"?
>> And if I add that on top of this mechanism, how do we not make the list
>> of MSRs that are handled in-kernel an ABI?
> 
> Jumping in for Aaron, who is out this afternoon...

Awesome, thanks for the super quick reply!

> 
> This patch doesn't attempt to solve your problem, "tell user space
> about unhandled MSRs." It attempts to solve our problem, "tell user
> space about a specific set of MSRs, even if kvm learns to handle them
> in the future." This is, in fact, what we really wanted to do when
> Peter Hornyack implemented the "tell user space about unhandled MSRs"
> changes in 2015. We just didn't realize it at the time.

Ok, let's take a step back then. What exactly are you trying to solve 
and which MSRs do you care about?

My main problem with a deny list approach is that it's (practically) 
impossible to map the allow list use case onto it. It is however trivial 
to only deny a few select MSRs explicitly with an allow list approach. I 
don't want to introduce yet another ABI to KVM in 2 years to then have 
both :).

> Though your proposal does partially solve our problem, it's a lot
> easier to specify a small set of MSRs by inclusion than it is by
> exclusion. (Where your proposal falls short of our needs is when
> userspace wants to handle an MSR that kvm would not typically
> intercept at all.)

The only MSRs that KVM does not intercept are the ones explicitly 
specified as pass-through

> 
>>> +
>>> +This ioctl needs to be called before vCPUs are setup otherwise the list of MSRs
>>> +will not be accepted and an EINVAL error will be returned.  Also, if a list of
>>> +MSRs has already been supplied, and this ioctl is called again an EEXIST error
>>> +will be returned.
>>> +
>>> +::
>>> +
>>> +  struct kvm_msr_list {
>>> +  __u32 nmsrs;
>>> +  __u32 indices[0];
>>> +};
>>> +
>>>    X86:
>>>    ^^^^
>>>
>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>> index be5363b21540..510055471dd0 100644
>>> --- a/arch/x86/include/asm/kvm_host.h
>>> +++ b/arch/x86/include/asm/kvm_host.h
>>> @@ -1004,6 +1004,8 @@ struct kvm_arch {
>>>
>>>           struct kvm_pmu_event_filter *pmu_event_filter;
>>>           struct task_struct *nx_lpage_recovery_thread;
>>> +
>>> +       struct kvm_msr_list *user_exit_msrs;
>>>    };
>>>
>>>    struct kvm_vm_stat {
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index 88c593f83b28..46a0fb9e0869 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -3419,6 +3419,42 @@ static inline bool kvm_can_mwait_in_guest(void)
>>>                   boot_cpu_has(X86_FEATURE_ARAT);
>>>    }
>>>
>>> +static int kvm_vm_ioctl_set_exit_msrs(struct kvm *kvm,
>>> +                                     struct kvm_msr_list __user *user_msr_list)
>>> +{
>>> +       struct kvm_msr_list *msr_list, hdr;
>>> +       size_t indices_size;
>>> +
>>> +       if (kvm->arch.user_exit_msrs != NULL)
>>> +               return -EEXIST;
>>> +
>>> +       if (kvm->created_vcpus)
>>> +               return -EINVAL;
>>
>> What if we need to change the set of user space handled MSRs
>> dynamically, for example because a feature has been enabled through a
>> previous MSR?
> 
> It's never been an issue for us, and this approach avoids the
> messiness of having to back out the old changes to the MSR permissions
> bitmaps, which is fraught. It can be done, but I would question the
> ROI on the additional complexity. In any case, I think a VCPU ioctl
> would be more appropriate than a VM ioctl if dynamic modifications
> were allowed. For instance, in your example, the previous MSR write
> that enabled a feature requiring a new user space MSR exit would
> likely apply only to the VCPU on which that previous MSR write
> happened.

My general rule of thumb with PPC has always been to make as much vCPU 
specific as possible. X86 KVM is a bit different there - and I guess it 
does help for memory consumption :).

> 
>>> +
>>> +       if (copy_from_user(&hdr, user_msr_list, sizeof(hdr)))
>>> +               return -EFAULT;
>>> +
>>> +       if (hdr.nmsrs >= MAX_IO_MSRS)
>>
>> This constant is not defined inside the scope of this patch. Is your
>> patchset fully bisectable? Please make sure that every patch builds
>> successfully on its own :).
> 
> I can't vouch for this patchset being fully bisectable, but this
> particular constant was moved to x86.c ~13 years ago, in commit
> 313a3dc75da20 ("KVM: Portability: split kvm_vcpu_ioctl"). It should
> not cause any build issues. :)

Ugh, sorry, my bad :).


Alex



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879
Jim Mattson Aug. 8, 2020, 12:39 a.m. UTC | #4
On Fri, Aug 7, 2020 at 3:49 PM Alexander Graf <graf@amazon.com> wrote:
>
> On 07.08.20 23:16, Jim Mattson wrote:
> > This patch doesn't attempt to solve your problem, "tell user space
> > about unhandled MSRs." It attempts to solve our problem, "tell user
> > space about a specific set of MSRs, even if kvm learns to handle them
> > in the future." This is, in fact, what we really wanted to do when
> > Peter Hornyack implemented the "tell user space about unhandled MSRs"
> > changes in 2015. We just didn't realize it at the time.
>
> Ok, let's take a step back then. What exactly are you trying to solve
> and which MSRs do you care about?

Our userspace handles a handful of MSRs, including some of the
synthetic PV time-related MSRs (both kvm and Hyper-V), which we want
to exit to userspace even though kvm thinks it knows what to do with
them. We also have a range of Google-specific MSRs that kvm will
probably never know about. And, finally, we find that as time goes on,
there are situations where we want to add support for a new MSR in
userspace because we can roll out a new userspace more quickly than a
new kernel. An example of this last category is
MSR_IA32_ARCH_CAPABILITIES. We don't necessarily want to cede control
of such MSRs to kvm as soon as it knows about them. For instance, we
may want to wait until the new kernel reaches its rollback horizon.
There may be a few more MSRs we handle in userspace, but I think that
covers the bulk of them.

I don't believe we have yet seen a case where we wanted to take
control of an MSR that kvm didn't intercept itself. That part of
Aaron's patch may be overkill, but it gives the API clean semantics.
It seems a bit awkward to have a userspace MSR list where accesses to
some of the MSRs exit to userspace and accesses to others do not. (I
believe that your implementation of the allow list suffers from this
awkwardness, though I haven't yet had time to review it in depth.)

> My main problem with a deny list approach is that it's (practically)
> impossible to map the allow list use case onto it. It is however trivial
> to only deny a few select MSRs explicitly with an allow list approach. I
> don't want to introduce yet another ABI to KVM in 2 years to then have
> both :).

I agree in part. A deny list does not support your use case well. But
does an allow list support it that much better? I suspect that the
allow list that specifies "every MSR that kvm knows about today" is
far from trivial to construct, especially if you consider different
microarchitectures and different VM configurations (e.g. vPMU vs. no
vPMU).

Can we back up and ask what it is that you want to accomplish? If you
simply want ignore_msrs to be per-VM, wouldn't it be easier to add an
ioctl to simply set a boolean that potentially overrides the module
parameter, and then modify the ignore_msrs code to consult the per-VM
boolean?

Getting back to one of the issues I raised earlier: your "exit instead
of #GP" patch exits to userspace on both unknown MSRs and illegal
WRMSR values. If userspace wants to implement "ignore MSRs" for a
particular VM, it's absolutely trivial when the reason for the exit is
"unknown MSR." However, it becomes more complicated when the reason
for the exit is "#GP." Now, userspace has to know which MSRs are known
to kvm before it can decide whether or not the access should #GP.

I hope this doesn't come across sounding antagonistic. Like you, I
want to avoid revisiting this design a few years down the road. We're
already in that boat with the 2015 "exit on unknown MSR" change!
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 320788f81a05..7d8167c165aa 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1006,6 +1006,30 @@  such as migration.
 :Parameters: struct kvm_vcpu_event (out)
 :Returns: 0 on success, -1 on error
 
+4.32 KVM_SET_EXIT_MSRS
+------------------
+
+:Capability: KVM_CAP_SET_MSR_EXITS
+:Architectures: x86
+:Type: vm ioctl
+:Parameters: struct kvm_msr_list (in)
+:Returns: 0 on success, -1 on error
+
+Sets the userspace MSR list which is used to track which MSRs KVM should send
+to userspace to be serviced when the guest executes rdmsr or wrmsr.
+
+This ioctl needs to be called before vCPUs are setup otherwise the list of MSRs
+will not be accepted and an EINVAL error will be returned.  Also, if a list of
+MSRs has already been supplied, and this ioctl is called again an EEXIST error
+will be returned.
+
+::
+
+  struct kvm_msr_list {
+  __u32 nmsrs;
+  __u32 indices[0];
+};
+
 X86:
 ^^^^
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index be5363b21540..510055471dd0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1004,6 +1004,8 @@  struct kvm_arch {
 
 	struct kvm_pmu_event_filter *pmu_event_filter;
 	struct task_struct *nx_lpage_recovery_thread;
+
+	struct kvm_msr_list *user_exit_msrs;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 88c593f83b28..46a0fb9e0869 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3419,6 +3419,42 @@  static inline bool kvm_can_mwait_in_guest(void)
 		boot_cpu_has(X86_FEATURE_ARAT);
 }
 
+static int kvm_vm_ioctl_set_exit_msrs(struct kvm *kvm,
+				      struct kvm_msr_list __user *user_msr_list)
+{
+	struct kvm_msr_list *msr_list, hdr;
+	size_t indices_size;
+
+	if (kvm->arch.user_exit_msrs != NULL)
+		return -EEXIST;
+
+	if (kvm->created_vcpus)
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, user_msr_list, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.nmsrs >= MAX_IO_MSRS)
+		return -E2BIG;
+
+	indices_size = sizeof(hdr.indices[0]) * hdr.nmsrs;
+	msr_list = kvzalloc(sizeof(struct kvm_msr_list) + indices_size,
+			    GFP_KERNEL_ACCOUNT);
+	if (!msr_list)
+		return -ENOMEM;
+	msr_list->nmsrs = hdr.nmsrs;
+
+	if (copy_from_user(msr_list->indices, user_msr_list->indices,
+			   indices_size)) {
+		kvfree(msr_list);
+		return -EFAULT;
+	}
+
+	kvm->arch.user_exit_msrs = msr_list;
+
+	return 0;
+}
+
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	int r = 0;
@@ -3476,6 +3512,7 @@  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MSR_PLATFORM_INFO:
 	case KVM_CAP_EXCEPTION_PAYLOAD:
 	case KVM_CAP_SET_GUEST_DEBUG:
+	case KVM_CAP_SET_MSR_EXITS:
 		r = 1;
 		break;
 	case KVM_CAP_SYNC_REGS:
@@ -5261,6 +5298,10 @@  long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_SET_EXIT_MSRS: {
+		r = kvm_vm_ioctl_set_exit_msrs(kvm, argp);
+		break;
+	}
 	case KVM_MEMORY_ENCRYPT_OP: {
 		r = -ENOTTY;
 		if (kvm_x86_ops.mem_enc_op)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4fdf30316582..de4638c1bd15 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1031,6 +1031,7 @@  struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_SECURE_GUEST 181
 #define KVM_CAP_HALT_POLL 182
 #define KVM_CAP_ASYNC_PF_INT 183
+#define KVM_CAP_SET_MSR_EXITS 185
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1371,6 +1372,7 @@  struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PMU_EVENT_FILTER */
 #define KVM_SET_PMU_EVENT_FILTER  _IOW(KVMIO,  0xb2, struct kvm_pmu_event_filter)
 #define KVM_PPC_SVM_OFF		  _IO(KVMIO,  0xb3)
+#define KVM_SET_EXIT_MSRS	_IOW(KVMIO, 0xb4, struct kvm_msr_list)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)