
[2/2] KVM: Protect vCPU's "last run PID" with rwlock, not RCU

Message ID 20240802200136.329973-3-seanjc@google.com (mailing list archive)
State New, archived
Series KVM: Protect vCPU's PID with a rwlock

Commit Message

Sean Christopherson Aug. 2, 2024, 8:01 p.m. UTC
To avoid jitter on KVM_RUN due to synchronize_rcu(), use a rwlock instead
of RCU to protect vcpu->pid, a.k.a. the pid of the task last used to run a
vCPU.  When userspace is doing M:N scheduling of tasks to vCPUs, e.g. to
run SEV migration helper vCPUs during post-copy, the synchronize_rcu()
needed to change the PID associated with the vCPU can stall for hundreds
of milliseconds, which is problematic for latency sensitive post-copy
operations.

In the directed yield path, do not acquire the lock if it's contended,
i.e. if the associated PID is changing, as that means the vCPU's task is
already running.

Reported-by: Steve Rutherford <srutherford@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/include/asm/kvm_host.h |  2 +-
 include/linux/kvm_host.h          |  3 ++-
 virt/kvm/kvm_main.c               | 32 +++++++++++++++++--------------
 3 files changed, 21 insertions(+), 16 deletions(-)

Comments

Steve Rutherford Aug. 2, 2024, 8:28 p.m. UTC | #1
On Fri, Aug 2, 2024 at 1:01 PM Sean Christopherson <seanjc@google.com> wrote:
>
> To avoid jitter on KVM_RUN due to synchronize_rcu(), use a rwlock instead
> of RCU to protect vcpu->pid, a.k.a. the pid of the task last used to run a
> vCPU.  When userspace is doing M:N scheduling of tasks to vCPUs, e.g. to
> run SEV migration helper vCPUs during post-copy, the synchronize_rcu()
> needed to change the PID associated with the vCPU can stall for hundreds
> of milliseconds, which is problematic for latency sensitive post-copy
> operations.
>
> In the directed yield path, do not acquire the lock if it's contended,
> i.e. if the associated PID is changing, as that means the vCPU's task is
> already running.
>
> Reported-by: Steve Rutherford <srutherford@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/arm64/include/asm/kvm_host.h |  2 +-
>  include/linux/kvm_host.h          |  3 ++-
>  virt/kvm/kvm_main.c               | 32 +++++++++++++++++--------------
>  3 files changed, 21 insertions(+), 16 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index a33f5996ca9f..7199cb014806 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -1115,7 +1115,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
>  void kvm_arm_halt_guest(struct kvm *kvm);
>  void kvm_arm_resume_guest(struct kvm *kvm);
>
> -#define vcpu_has_run_once(vcpu)        !!rcu_access_pointer((vcpu)->pid)
> +#define vcpu_has_run_once(vcpu)        (!!READ_ONCE((vcpu)->pid))
>
>  #ifndef __KVM_NVHE_HYPERVISOR__
>  #define kvm_call_hyp_nvhe(f, ...)                                              \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 689e8be873a7..d6f4e8b2b44c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -342,7 +342,8 @@ struct kvm_vcpu {
>  #ifndef __KVM_HAVE_ARCH_WQP
>         struct rcuwait wait;
>  #endif
> -       struct pid __rcu *pid;
> +       struct pid *pid;
> +       rwlock_t pid_lock;
>         int sigset_active;
>         sigset_t sigset;
>         unsigned int halt_poll_ns;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 91048a7ad3be..fabffd85fa34 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -486,6 +486,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>         vcpu->kvm = kvm;
>         vcpu->vcpu_id = id;
>         vcpu->pid = NULL;
> +       rwlock_init(&vcpu->pid_lock);
>  #ifndef __KVM_HAVE_ARCH_WQP
>         rcuwait_init(&vcpu->wait);
>  #endif
> @@ -513,7 +514,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
>          * the vcpu->pid pointer, and at destruction time all file descriptors
>          * are already gone.
>          */
> -       put_pid(rcu_dereference_protected(vcpu->pid, 1));
> +       put_pid(vcpu->pid);
>
>         free_page((unsigned long)vcpu->run);
>         kmem_cache_free(kvm_vcpu_cache, vcpu);
> @@ -3930,15 +3931,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
>
>  int kvm_vcpu_yield_to(struct kvm_vcpu *target)
>  {
> -       struct pid *pid;
>         struct task_struct *task = NULL;
>         int ret;
>
> -       rcu_read_lock();
> -       pid = rcu_dereference(target->pid);
> -       if (pid)
> -               task = get_pid_task(pid, PIDTYPE_PID);
> -       rcu_read_unlock();
> +       if (!read_trylock(&target->pid_lock))
> +               return 0;
> +
> +       if (target->pid)
> +               task = get_pid_task(target->pid, PIDTYPE_PID);
> +
> +       read_unlock(&target->pid_lock);
> +
>         if (!task)
>                 return 0;
>         ret = yield_to(task, 1);
> @@ -4178,9 +4181,9 @@ static int vcpu_get_pid(void *data, u64 *val)
>  {
>         struct kvm_vcpu *vcpu = data;
>
> -       rcu_read_lock();
> -       *val = pid_nr(rcu_dereference(vcpu->pid));
> -       rcu_read_unlock();
> +       read_lock(&vcpu->pid_lock);
> +       *val = pid_nr(vcpu->pid);
> +       read_unlock(&vcpu->pid_lock);
>         return 0;
>  }
>
> @@ -4466,7 +4469,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
>                 r = -EINVAL;
>                 if (arg)
>                         goto out;
> -               oldpid = rcu_access_pointer(vcpu->pid);
> +               oldpid = vcpu->pid;

Overall this patch looks correct, but this spot took me a moment, and
I want to confirm. This skips the reader lock since writing only
happens just below, under the vcpu lock, and we've already taken that
lock?

>                 if (unlikely(oldpid != task_pid(current))) {
>                         /* The thread running this VCPU changed. */
>                         struct pid *newpid;
> @@ -4476,9 +4479,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
>                                 break;
>
>                         newpid = get_task_pid(current, PIDTYPE_PID);
> -                       rcu_assign_pointer(vcpu->pid, newpid);
> -                       if (oldpid)
> -                               synchronize_rcu();
> +                       write_lock(&vcpu->pid_lock);
> +                       vcpu->pid = newpid;
> +                       write_unlock(&vcpu->pid_lock);
> +
>                         put_pid(oldpid);
>                 }
>                 vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
> --
> 2.46.0.rc2.264.g509ed76dc8-goog
>
Sean Christopherson Aug. 2, 2024, 8:51 p.m. UTC | #2
On Fri, Aug 02, 2024, Steve Rutherford wrote:
> On Fri, Aug 2, 2024 at 1:01 PM Sean Christopherson <seanjc@google.com> wrote:
> > @@ -4178,9 +4181,9 @@ static int vcpu_get_pid(void *data, u64 *val)
> >  {
> >         struct kvm_vcpu *vcpu = data;
> >
> > -       rcu_read_lock();
> > -       *val = pid_nr(rcu_dereference(vcpu->pid));
> > -       rcu_read_unlock();
> > +       read_lock(&vcpu->pid_lock);
> > +       *val = pid_nr(vcpu->pid);
> > +       read_unlock(&vcpu->pid_lock);
> >         return 0;
> >  }
> >
> > @@ -4466,7 +4469,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
> >                 r = -EINVAL;
> >                 if (arg)
> >                         goto out;
> > -               oldpid = rcu_access_pointer(vcpu->pid);
> > +               oldpid = vcpu->pid;
> 
> Overall this patch looks correct, but this spot took me a moment, and
> I want to confirm. This skips the reader lock since writing only
> happens just below, under the vcpu lock, and we've already taken that
> lock?

Yep, exactly.
Steve Rutherford Aug. 2, 2024, 9:27 p.m. UTC | #3
On Fri, Aug 2, 2024 at 1:01 PM Sean Christopherson <seanjc@google.com> wrote:
>
> To avoid jitter on KVM_RUN due to synchronize_rcu(), use a rwlock instead
> of RCU to protect vcpu->pid, a.k.a. the pid of the task last used to run a
> vCPU.  When userspace is doing M:N scheduling of tasks to vCPUs, e.g. to
> run SEV migration helper vCPUs during post-copy, the synchronize_rcu()
> needed to change the PID associated with the vCPU can stall for hundreds
> of milliseconds, which is problematic for latency sensitive post-copy
> operations.
>
> In the directed yield path, do not acquire the lock if it's contended,
> i.e. if the associated PID is changing, as that means the vCPU's task is
> already running.
>
> Reported-by: Steve Rutherford <srutherford@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/arm64/include/asm/kvm_host.h |  2 +-
>  include/linux/kvm_host.h          |  3 ++-
>  virt/kvm/kvm_main.c               | 32 +++++++++++++++++--------------
>  3 files changed, 21 insertions(+), 16 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index a33f5996ca9f..7199cb014806 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -1115,7 +1115,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
>  void kvm_arm_halt_guest(struct kvm *kvm);
>  void kvm_arm_resume_guest(struct kvm *kvm);
>
> -#define vcpu_has_run_once(vcpu)        !!rcu_access_pointer((vcpu)->pid)
> +#define vcpu_has_run_once(vcpu)        (!!READ_ONCE((vcpu)->pid))
>
>  #ifndef __KVM_NVHE_HYPERVISOR__
>  #define kvm_call_hyp_nvhe(f, ...)                                              \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 689e8be873a7..d6f4e8b2b44c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -342,7 +342,8 @@ struct kvm_vcpu {
>  #ifndef __KVM_HAVE_ARCH_WQP
>         struct rcuwait wait;
>  #endif
> -       struct pid __rcu *pid;
> +       struct pid *pid;
> +       rwlock_t pid_lock;
>         int sigset_active;
>         sigset_t sigset;
>         unsigned int halt_poll_ns;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 91048a7ad3be..fabffd85fa34 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -486,6 +486,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
>         vcpu->kvm = kvm;
>         vcpu->vcpu_id = id;
>         vcpu->pid = NULL;
> +       rwlock_init(&vcpu->pid_lock);
>  #ifndef __KVM_HAVE_ARCH_WQP
>         rcuwait_init(&vcpu->wait);
>  #endif
> @@ -513,7 +514,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
>          * the vcpu->pid pointer, and at destruction time all file descriptors
>          * are already gone.
>          */
> -       put_pid(rcu_dereference_protected(vcpu->pid, 1));
> +       put_pid(vcpu->pid);
>
>         free_page((unsigned long)vcpu->run);
>         kmem_cache_free(kvm_vcpu_cache, vcpu);
> @@ -3930,15 +3931,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
>
>  int kvm_vcpu_yield_to(struct kvm_vcpu *target)
>  {
> -       struct pid *pid;
>         struct task_struct *task = NULL;
>         int ret;
>
> -       rcu_read_lock();
> -       pid = rcu_dereference(target->pid);
> -       if (pid)
> -               task = get_pid_task(pid, PIDTYPE_PID);
> -       rcu_read_unlock();
> +       if (!read_trylock(&target->pid_lock))
> +               return 0;
> +
> +       if (target->pid)
> +               task = get_pid_task(target->pid, PIDTYPE_PID);
> +
> +       read_unlock(&target->pid_lock);
> +
>         if (!task)
>                 return 0;
>         ret = yield_to(task, 1);
> @@ -4178,9 +4181,9 @@ static int vcpu_get_pid(void *data, u64 *val)
>  {
>         struct kvm_vcpu *vcpu = data;
>
> -       rcu_read_lock();
> -       *val = pid_nr(rcu_dereference(vcpu->pid));
> -       rcu_read_unlock();
> +       read_lock(&vcpu->pid_lock);
> +       *val = pid_nr(vcpu->pid);
> +       read_unlock(&vcpu->pid_lock);
>         return 0;
>  }
>
> @@ -4466,7 +4469,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
>                 r = -EINVAL;
>                 if (arg)
>                         goto out;
> -               oldpid = rcu_access_pointer(vcpu->pid);
> +               oldpid = vcpu->pid;
>                 if (unlikely(oldpid != task_pid(current))) {
>                         /* The thread running this VCPU changed. */
>                         struct pid *newpid;
> @@ -4476,9 +4479,10 @@ static long kvm_vcpu_ioctl(struct file *filp,
>                                 break;
>
>                         newpid = get_task_pid(current, PIDTYPE_PID);
> -                       rcu_assign_pointer(vcpu->pid, newpid);
> -                       if (oldpid)
> -                               synchronize_rcu();
> +                       write_lock(&vcpu->pid_lock);
> +                       vcpu->pid = newpid;
> +                       write_unlock(&vcpu->pid_lock);
> +
>                         put_pid(oldpid);
>                 }
>                 vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
> --
> 2.46.0.rc2.264.g509ed76dc8-goog
>
Reviewed-by: Steve Rutherford <srutherford@google.com>
Oliver Upton Aug. 6, 2024, 10:58 p.m. UTC | #4
On Fri, Aug 02, 2024 at 01:01:36PM -0700, Sean Christopherson wrote:
> To avoid jitter on KVM_RUN due to synchronize_rcu(), use a rwlock instead
> of RCU to protect vcpu->pid, a.k.a. the pid of the task last used to run a
> vCPU.  When userspace is doing M:N scheduling of tasks to vCPUs, e.g. to
> run SEV migration helper vCPUs during post-copy, the synchronize_rcu()
> needed to change the PID associated with the vCPU can stall for hundreds
> of milliseconds, which is problematic for latency sensitive post-copy
> operations.
> 
> In the directed yield path, do not acquire the lock if it's contended,
> i.e. if the associated PID is changing, as that means the vCPU's task is
> already running.
> 
> Reported-by: Steve Rutherford <srutherford@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/arm64/include/asm/kvm_host.h |  2 +-
>  include/linux/kvm_host.h          |  3 ++-
>  virt/kvm/kvm_main.c               | 32 +++++++++++++++++--------------
>  3 files changed, 21 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index a33f5996ca9f..7199cb014806 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -1115,7 +1115,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
>  void kvm_arm_halt_guest(struct kvm *kvm);
>  void kvm_arm_resume_guest(struct kvm *kvm);
>  
> -#define vcpu_has_run_once(vcpu)	!!rcu_access_pointer((vcpu)->pid)
> +#define vcpu_has_run_once(vcpu)	(!!READ_ONCE((vcpu)->pid))
>  
>  #ifndef __KVM_NVHE_HYPERVISOR__
>  #define kvm_call_hyp_nvhe(f, ...)						\
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 689e8be873a7..d6f4e8b2b44c 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -342,7 +342,8 @@ struct kvm_vcpu {
>  #ifndef __KVM_HAVE_ARCH_WQP
>  	struct rcuwait wait;
>  #endif
> -	struct pid __rcu *pid;
> +	struct pid *pid;
> +	rwlock_t pid_lock;
>  	int sigset_active;
>  	sigset_t sigset;
>  	unsigned int halt_poll_ns;

Adding yet another lock is never exciting, but this looks fine. Can you
nest this lock inside of the vcpu->mutex acquisition in
kvm_vm_ioctl_create_vcpu() so lockdep gets the picture?
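
For illustration, roughly the shape of priming I have in mind, mirroring the
existing kvm->lock vs. vcpu->mutex dance in kvm_vm_ioctl_create_vcpu() (a
sketch only, not tested):

	mutex_lock(&vcpu->mutex);
	/* Teach lockdep that pid_lock nests inside vcpu->mutex. */
	write_lock(&vcpu->pid_lock);
	write_unlock(&vcpu->pid_lock);
	mutex_unlock(&vcpu->mutex);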


> @@ -4466,7 +4469,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
>  		r = -EINVAL;
>  		if (arg)
>  			goto out;
> -		oldpid = rcu_access_pointer(vcpu->pid);
> +		oldpid = vcpu->pid;

It'd be good to add a comment here about how this is guarded by the
vcpu->mutex, as Steve points out.
Sean Christopherson Aug. 6, 2024, 11:59 p.m. UTC | #5
On Tue, Aug 06, 2024, Oliver Upton wrote:
> On Fri, Aug 02, 2024 at 01:01:36PM -0700, Sean Christopherson wrote:
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index a33f5996ca9f..7199cb014806 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -1115,7 +1115,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
> >  void kvm_arm_halt_guest(struct kvm *kvm);
> >  void kvm_arm_resume_guest(struct kvm *kvm);
> >  
> > -#define vcpu_has_run_once(vcpu)	!!rcu_access_pointer((vcpu)->pid)
> > +#define vcpu_has_run_once(vcpu)	(!!READ_ONCE((vcpu)->pid))
> >  
> >  #ifndef __KVM_NVHE_HYPERVISOR__
> >  #define kvm_call_hyp_nvhe(f, ...)						\
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 689e8be873a7..d6f4e8b2b44c 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -342,7 +342,8 @@ struct kvm_vcpu {
> >  #ifndef __KVM_HAVE_ARCH_WQP
> >  	struct rcuwait wait;
> >  #endif
> > -	struct pid __rcu *pid;
> > +	struct pid *pid;
> > +	rwlock_t pid_lock;
> >  	int sigset_active;
> >  	sigset_t sigset;
> >  	unsigned int halt_poll_ns;
> 
> Adding yet another lock is never exciting, but this looks fine.

Heh, my feelings too.  Maybe that's why I didn't post this for two years.

> Can you nest this lock inside of the vcpu->mutex acquisition in
> kvm_vm_ioctl_create_vcpu() so lockdep gets the picture?

I don't think that's necessary.  Commit 42a90008f890 ("KVM: Ensure lockdep knows
about kvm->lock vs. vcpu->mutex ordering rule") added the lock+unlock in
kvm_vm_ioctl_create_vcpu() purely because actually taking vcpu->mutex inside
kvm->lock is rare, i.e. lockdep would be unable to detect issues except for very
specific VM types hitting very specific flows.

But for this lock, every arch is guaranteed to take the lock on the first KVM_RUN,
as "oldpid" is '0' and guaranteed to mismatch task_pid(current).  So I don't think
we need to go out of our way to alert lockdep.

> > @@ -4466,7 +4469,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
> >  		r = -EINVAL;
> >  		if (arg)
> >  			goto out;
> > -		oldpid = rcu_access_pointer(vcpu->pid);
> > +		oldpid = vcpu->pid;
> 
> It'd be good to add a comment here about how this is guarded by the
> vcpu->mutex, as Steve points out.

Roger that.
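
Something along these lines (exact wording TBD):

		/*
		 * Note, vcpu->pid is only written from this path, i.e. from
		 * KVM_RUN, while holding vcpu->mutex, which the caller has
		 * already acquired, so an unlocked read is safe here.
		 */
		oldpid = vcpu->pid;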
Oliver Upton Aug. 9, 2024, 6:05 p.m. UTC | #6
On Tue, Aug 06, 2024 at 04:59:03PM -0700, Sean Christopherson wrote:
> > Can you nest this lock inside of the vcpu->mutex acquisition in
> > kvm_vm_ioctl_create_vcpu() so lockdep gets the picture?
> 
> I don't think that's necessary.  Commit 42a90008f890 ("KVM: Ensure lockdep knows
> about kvm->lock vs. vcpu->mutex ordering rule") added the lock+unlock in
> kvm_vm_ioctl_create_vcpu() purely because actually taking vcpu->mutex inside
> kvm->lock is rare, i.e. lockdep would be unable to detect issues except for very
> specific VM types hitting very specific flows.

I don't think the perceived rarity matters at all w/ this. Beyond the
lockdep benefits, it is a self-documenting way to describe lock ordering.
Dunno about you, but I haven't kept up with locking.rst at all :)

Having said that, an inversion would still be *very* obvious, as it
would be trying to grab a mutex while holding a spinlock...
Sean Christopherson Aug. 13, 2024, 2:05 a.m. UTC | #7
On Fri, Aug 09, 2024, Oliver Upton wrote:
> On Tue, Aug 06, 2024 at 04:59:03PM -0700, Sean Christopherson wrote:
> > > Can you nest this lock inside of the vcpu->mutex acquisition in
> > > kvm_vm_ioctl_create_vcpu() so lockdep gets the picture?
> > 
> > I don't think that's necessary.  Commit 42a90008f890 ("KVM: Ensure lockdep knows
> > about kvm->lock vs. vcpu->mutex ordering rule") added the lock+unlock in
> > kvm_vm_ioctl_create_vcpu() purely because actually taking vcpu->mutex inside
> > kvm->lock is rare, i.e. lockdep would be unable to detect issues except for very
> > specific VM types hitting very specific flows.
> 
> I don't think the perceived rarity matters at all w/ this.

Rarity definitely matters.  If KVM was splattered with code that takes vcpu->mutex
inside kvm->lock, then the mess that led to the above commit likely would never have
happened.

> Beyond the lockdep benefits, it is a self-documenting way to describe lock
> ordering.

Lock acquisition alone won't suffice; many of the more unique locks in KVM need
comments/documentation, e.g. to explain additional rules, assumptions that make
things work, etc.  We could obviously add comments for everything, but I don't
see how that's clearly better than actual documentation.  E.g. pid_lock is taken
for read across vCPUs.  Acquiring vcpu->pid_lock inside vcpu->mutex doesn't
capture that at all.

It's also simply not realistic to enumerate every possible combination.  Many of
the combinations will likely never happen in practice, especially for spinlocks
since their usage is quite targeted.  Trying to document the "preferred" ordering
between the various spinlocks would be an exercise in futility as so many would
be 100% arbitrary due to lack of a use case.

KVM's mutexes are more interesting because they tend to be coarser, and thus are
more prone to nesting, so maybe we could have lockdep-enforced documentation for
those?  But if we want to do that, I think we should have a dedicated helper (and
possible arch hooks), not an ad hoc pile of locks in vCPU creation.

And we should have that discussion over here[*], because I was planning on posting
a patch to revert the lockdep-only lock "documentation".

[*] https://lore.kernel.org/all/ZrFYsSPaDWUHOl0N@google.com

> Dunno about you, but I haven't kept up with locking.rst at all :)

Heh, x86 has done a decent job of documenting its lock usage.  I would much rather
add an entry in locking.rst for this new lock than add a lock+unlock in vCPU
creation.  Especially since the usage is rather uncommon (guaranteed single writer,
readers are best-effort and cross-vCPU).
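
E.g. a rough sketch of such an entry (wording and placement TBD):

``vcpu->pid_lock``
^^^^^^^^^^^^^^^^^^

:Type:		rwlock_t
:Arch:		any
:Protects:	vcpu->pid
:Comment:	Written only from KVM_RUN, under vcpu->mutex, when the task
		running the vCPU changes; taken only for read by other tasks,
		e.g. directed yield and the debugfs PID getter.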

> Having said that, an inversion would still be *very* obvious, as it
> would be trying to grab a mutex while holding a spinlock...
> 
> -- 
> Thanks,
> Oliver

Patch

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a33f5996ca9f..7199cb014806 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1115,7 +1115,7 @@  int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
-#define vcpu_has_run_once(vcpu)	!!rcu_access_pointer((vcpu)->pid)
+#define vcpu_has_run_once(vcpu)	(!!READ_ONCE((vcpu)->pid))
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 #define kvm_call_hyp_nvhe(f, ...)						\
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 689e8be873a7..d6f4e8b2b44c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -342,7 +342,8 @@  struct kvm_vcpu {
 #ifndef __KVM_HAVE_ARCH_WQP
 	struct rcuwait wait;
 #endif
-	struct pid __rcu *pid;
+	struct pid *pid;
+	rwlock_t pid_lock;
 	int sigset_active;
 	sigset_t sigset;
 	unsigned int halt_poll_ns;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 91048a7ad3be..fabffd85fa34 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -486,6 +486,7 @@  static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
+	rwlock_init(&vcpu->pid_lock);
 #ifndef __KVM_HAVE_ARCH_WQP
 	rcuwait_init(&vcpu->wait);
 #endif
@@ -513,7 +514,7 @@  static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 	 * the vcpu->pid pointer, and at destruction time all file descriptors
 	 * are already gone.
 	 */
-	put_pid(rcu_dereference_protected(vcpu->pid, 1));
+	put_pid(vcpu->pid);
 
 	free_page((unsigned long)vcpu->run);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -3930,15 +3931,17 @@  EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 
 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
-	struct pid *pid;
 	struct task_struct *task = NULL;
 	int ret;
 
-	rcu_read_lock();
-	pid = rcu_dereference(target->pid);
-	if (pid)
-		task = get_pid_task(pid, PIDTYPE_PID);
-	rcu_read_unlock();
+	if (!read_trylock(&target->pid_lock))
+		return 0;
+
+	if (target->pid)
+		task = get_pid_task(target->pid, PIDTYPE_PID);
+
+	read_unlock(&target->pid_lock);
+
 	if (!task)
 		return 0;
 	ret = yield_to(task, 1);
@@ -4178,9 +4181,9 @@  static int vcpu_get_pid(void *data, u64 *val)
 {
 	struct kvm_vcpu *vcpu = data;
 
-	rcu_read_lock();
-	*val = pid_nr(rcu_dereference(vcpu->pid));
-	rcu_read_unlock();
+	read_lock(&vcpu->pid_lock);
+	*val = pid_nr(vcpu->pid);
+	read_unlock(&vcpu->pid_lock);
 	return 0;
 }
 
@@ -4466,7 +4469,7 @@  static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (arg)
 			goto out;
-		oldpid = rcu_access_pointer(vcpu->pid);
+		oldpid = vcpu->pid;
 		if (unlikely(oldpid != task_pid(current))) {
 			/* The thread running this VCPU changed. */
 			struct pid *newpid;
@@ -4476,9 +4479,10 @@  static long kvm_vcpu_ioctl(struct file *filp,
 				break;
 
 			newpid = get_task_pid(current, PIDTYPE_PID);
-			rcu_assign_pointer(vcpu->pid, newpid);
-			if (oldpid)
-				synchronize_rcu();
+			write_lock(&vcpu->pid_lock);
+			vcpu->pid = newpid;
+			write_unlock(&vcpu->pid_lock);
+
 			put_pid(oldpid);
 		}
 		vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);