
[v2,1/4] VMX: Properly handle pi when all the assigned devices are removed

Message ID 1464269954-8056-2-git-send-email-feng.wu@intel.com (mailing list archive)
State New, archived

Commit Message

Wu, Feng May 26, 2016, 1:39 p.m. UTC
This patch handles a corner case encountered when the last assigned
device is removed from the domain. In this case we need to carefully
handle the PI descriptor and the per-cpu blocking list, to make sure:
- all PI descriptors are in the right state when a device is next
assigned to the domain again. This is achieved by always keeping all
the PI hooks in place, so the PI descriptor is updated during
scheduling, which keeps it up-to-date.
- no vCPUs of the domain remain in the per-cpu blocking list.

Signed-off-by: Feng Wu <feng.wu@intel.com>
---
 xen/arch/x86/hvm/vmx/vmx.c         | 75 +++++++++++++++++++++++++++++++-------
 xen/include/asm-x86/hvm/vmx/vmcs.h |  3 ++
 2 files changed, 65 insertions(+), 13 deletions(-)

Comments

Jan Beulich May 27, 2016, 1:43 p.m. UTC | #1
>>> On 26.05.16 at 15:39, <feng.wu@intel.com> wrote:
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -113,7 +113,19 @@ static void vmx_vcpu_block(struct vcpu *v)
>  		&per_cpu(vmx_pi_blocking, v->processor).lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
>  
> -    spin_lock_irqsave(pi_blocking_list_lock, flags);
> +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> +    if ( unlikely(v->arch.hvm_vmx.pi_blocking_cleaned_up) )
> +    {
> +        /*
> +         * The vcpu is to be destroyed and it has already been removed
> +         * from the per-CPU list if it is blocking, we shouldn't add
> +         * new vCPU to the list.
> +         */

This comment appears to be stale wrt the implementation further
down. But see there for whether it's the comment or the code
that needs to change.

> +        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> +        return;
> +    }
> +
> +    spin_lock(pi_blocking_list_lock);

There doesn't appear to be an active problem with this, but
taking a global lock inside a per-vCPU one feels bad. Both here
and in vmx_pi_blocking_cleanup() I can't see why the global
lock alone won't do - you acquire it anyway.

> +static void vmx_pi_blocking_cleanup(struct vcpu *v)
> +{
> +    unsigned long flags;
> +    spinlock_t *pi_blocking_list_lock;
> +
> +    if ( !iommu_intpost )
> +        return;

If the function is to remain to be called from just the body of a loop
over all vCPU-s, please make that loop conditional upon iommu_intpost
instead of checking it here (and bailing) for every vCPU.
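
I.e. something along these lines - a sketch only, restating the hunks
below with the check hoisted into the sole caller:

    /* The caller already bails for !iommu_intpost ... */
    void vmx_pi_hooks_deassign(struct domain *d)
    {
        struct vcpu *v;

        if ( !iommu_intpost || !has_hvm_container_domain(d) )
            return;

        for_each_vcpu ( d, v )
            vmx_pi_blocking_cleanup(v);  /* ... so no re-check in here. */
    }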

> +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> +    v->arch.hvm_vmx.pi_blocking_cleaned_up = 1;
> +
> +    pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
> +    if (pi_blocking_list_lock == NULL)

Coding style.

> +    {
> +        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> +        return;
> +    }
> +
> +    spin_lock(pi_blocking_list_lock);
> +    if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
> +    {
> +        ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
> +        list_del(&v->arch.hvm_vmx.pi_blocking.list);
> +        v->arch.hvm_vmx.pi_blocking.lock = NULL;
> +    }
> +    spin_unlock(pi_blocking_list_lock);
> +    spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> +}
> +
>  /* This function is called when pcidevs_lock is held */
>  void vmx_pi_hooks_assign(struct domain *d)
>  {
> +    struct vcpu *v;
> +
>      if ( !iommu_intpost || !has_hvm_container_domain(d) )
>          return;
>  
> -    ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
> +    for_each_vcpu ( d, v )
> +        v->arch.hvm_vmx.pi_blocking_cleaned_up = 0;
>  
> -    d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
> -    d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
> -    d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
> -    d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
> +    if ( !d->arch.hvm_domain.vmx.vcpu_block )
> +    {
> +        d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
> +        d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
> +        d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
> +        d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
> +    }
>  }
>  
>  /* This function is called when pcidevs_lock is held */
>  void vmx_pi_hooks_deassign(struct domain *d)
>  {
> +    struct vcpu *v;
> +
>      if ( !iommu_intpost || !has_hvm_container_domain(d) )
>          return;
>  
> -    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
> -
> -    d->arch.hvm_domain.vmx.vcpu_block = NULL;
> -    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
> -    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
> -    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
> +    for_each_vcpu ( d, v )
> +        vmx_pi_blocking_cleanup(v);

If you keep the hooks in place, why is it relevant to do the cleanup
here instead of just at domain death? As suggested before, if you
did it there, you'd likely get away without adding the new per-vCPU
flag.

Furthermore, if things remain the way they are now, a per-domain
flag would suffice.

And finally - do you really need to retain all four hooks? Since if not,
one that gets zapped here could be used in place of such a per-
domain flag.

> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -231,6 +231,9 @@ struct arch_vmx_struct {
>       * pCPU and wakeup the related vCPU.
>       */
>      struct pi_blocking_vcpu pi_blocking;
> +
> +    spinlock_t            pi_hotplug_lock;

Having gone through all of the patch, I have a problem seeing the
hotplug nature of this.

> +    bool_t                pi_blocking_cleaned_up;

If this flag is to remain, it should move into its designated structure
(right above your addition).

Jan
Wu, Feng May 31, 2016, 10:22 a.m. UTC | #2
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Friday, May 27, 2016 9:43 PM
> To: Wu, Feng <feng.wu@intel.com>
> Cc: andrew.cooper3@citrix.com; dario.faggioli@citrix.com;
> george.dunlap@eu.citrix.com; Tian, Kevin <kevin.tian@intel.com>; xen-
> devel@lists.xen.org; konrad.wilk@oracle.com; keir@xen.org
> Subject: Re: [PATCH v2 1/4] VMX: Properly handle pi when all the assigned
> devices are removed
> 
> >>> On 26.05.16 at 15:39, <feng.wu@intel.com> wrote:
> > --- a/xen/arch/x86/hvm/vmx/vmx.c
> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
> > @@ -113,7 +113,19 @@ static void vmx_vcpu_block(struct vcpu *v)
> >  		&per_cpu(vmx_pi_blocking, v->processor).lock;
> >      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> >
> > -    spin_lock_irqsave(pi_blocking_list_lock, flags);
> > +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> > +    if ( unlikely(v->arch.hvm_vmx.pi_blocking_cleaned_up) )
> > +    {
> > +        /*
> > +         * The vcpu is to be destroyed and it has already been removed
> > +         * from the per-CPU list if it is blocking, we shouldn't add
> > +         * new vCPU to the list.
> > +         */
> 
> This comment appears to be stale wrt the implementation further
> down. But see there for whether it's the comment or the code
> that needs to change.
> 
> > +        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> > +        return;
> > +    }
> > +
> > +    spin_lock(pi_blocking_list_lock);
> 
> There doesn't appear to be an active problem with this, but
> taking a global lock inside a per-vCPU one feels bad. Both here
> and in vmx_pi_blocking_cleanup() I can't see why the global
> lock alone won't do - you acquire it anyway.

The purpose of 'v->arch.hvm_vmx.pi_hotplug_lock' is to make
sure things go right when vmx_pi_blocking_cleanup() and
vmx_vcpu_block() run concurrently. However, the lock
'pi_blocking_list_lock' in vmx_pi_blocking_cleanup() refers to
'v->arch.hvm_vmx.pi_blocking.lock', while 'pi_blocking_list_lock'
in vmx_vcpu_block() is '&per_cpu(vmx_pi_blocking, v->processor).lock'.
These two can be different; please consider the following scenario:

vmx_pi_blocking_cleanup() gets called while the vCPU is in the
blocking state and 'v->arch.hvm_vmx.pi_blocking.lock' points to the
per-cpu lock of pCPU0. When acquiring the lock in this function,
another path wins (such as vmx_pi_do_resume() or pi_wakeup_interrupt()),
so we spin in vmx_pi_blocking_cleanup(). After the vCPU is woken up,
it runs on pCPU1; some time later the vCPU blocks again and enters
vmx_vcpu_block() while the previous vmx_pi_blocking_cleanup() still
hasn't finished its job. We then acquire a different lock (pCPU1's
this time) in vmx_vcpu_block() than the one in
vmx_pi_blocking_cleanup(). This can break things, since we might
still put the vCPU on the per-cpu blocking list after the vCPU is
destroyed or the last assigned device is detached.
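
To illustrate, these are the two lock expressions involved (just a
sketch restating the code in the patch):

    /* In vmx_vcpu_block(): the list lock of the pCPU the vCPU blocks on now. */
    spinlock_t *lock_in_block = &per_cpu(vmx_pi_blocking, v->processor).lock;

    /* In vmx_pi_blocking_cleanup(): the lock recorded when the vCPU blocked. */
    spinlock_t *lock_in_cleanup = v->arch.hvm_vmx.pi_blocking.lock;

    /*
     * After a wakeup plus migration from pCPU0 to pCPU1, lock_in_block is
     * pCPU1's lock while lock_in_cleanup still points at pCPU0's, so the
     * two paths would not serialise against each other without the extra
     * per-vCPU pi_hotplug_lock.
     */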

> 
> > +static void vmx_pi_blocking_cleanup(struct vcpu *v)
> > +{
> > +    unsigned long flags;
> > +    spinlock_t *pi_blocking_list_lock;
> > +
> > +    if ( !iommu_intpost )
> > +        return;
> 
> If the function is to remain to be called from just the body of a loop
> over all vCPU-s, please make that loop conditional upon iommu_intpost
> instead of checking it here (and bailing) for every vCPU.
> 
> > +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> > +    v->arch.hvm_vmx.pi_blocking_cleaned_up = 1;
> > +
> > +    pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
> > +    if (pi_blocking_list_lock == NULL)
> 
> Coding style.
> 
> > +    {
> > +        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> > +        return;
> > +    }
> > +
> > +    spin_lock(pi_blocking_list_lock);
> > +    if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
> > +    {
> > +        ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
> > +        list_del(&v->arch.hvm_vmx.pi_blocking.list);
> > +        v->arch.hvm_vmx.pi_blocking.lock = NULL;
> > +    }
> > +    spin_unlock(pi_blocking_list_lock);
> > +    spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> > +}
> > +
> >  /* This function is called when pcidevs_lock is held */
> >  void vmx_pi_hooks_assign(struct domain *d)
> >  {
> > +    struct vcpu *v;
> > +
> >      if ( !iommu_intpost || !has_hvm_container_domain(d) )
> >          return;
> >
> > -    ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
> > +    for_each_vcpu ( d, v )
> > +        v->arch.hvm_vmx.pi_blocking_cleaned_up = 0;
> >
> > -    d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
> > -    d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
> > -    d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
> > -    d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
> > +    if ( !d->arch.hvm_domain.vmx.vcpu_block )
> > +    {
> > +        d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
> > +        d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
> > +        d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
> > +        d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
> > +    }
> >  }
> >
> >  /* This function is called when pcidevs_lock is held */
> >  void vmx_pi_hooks_deassign(struct domain *d)
> >  {
> > +    struct vcpu *v;
> > +
> >      if ( !iommu_intpost || !has_hvm_container_domain(d) )
> >          return;
> >
> > -    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
> > -
> > -    d->arch.hvm_domain.vmx.vcpu_block = NULL;
> > -    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
> > -    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
> > -    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
> > +    for_each_vcpu ( d, v )
> > +        vmx_pi_blocking_cleanup(v);
> 
> If you keep the hooks in place, why is it relevant to do the cleanup
> here instead of just at domain death? As suggested before, if you
> did it there, you'd likely get away without adding the new per-vCPU
> flag.

I don't quite understand this. Adding the cleanup here is to handle
the corner case when the last assigned device is detached from the
domain. Why do you think we don't need the per-vCPU flag? We need
to set it here to indicate that the vCPU has been cleaned up, and in
vmx_vcpu_block(), we must not put vCPUs with this flag set on the
per-cpu blocking list. Am I missing something?

> 
> Furthermore, if things remain the way they are now, a per-domain
> flag would suffice.

vmx_pi_blocking_cleanup() is called in vmx_cpu_destroy(), which can
be called during domain destruction or from vcpu_initialise() if it
encounters errors. For the latter case, if we used a per-domain flag
and set it in vmx_pi_blocking_cleanup(), that would be problematic,
since it would affect other vCPUs which should be running normally.

> 
> And finally - do you really need to retain all four hooks? Since if not,
> one that gets zapped here could be used in place of such a per-
> domain flag.

Can you elaborate a bit more on this? Thanks a lot!

> 
> > --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> > +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> > @@ -231,6 +231,9 @@ struct arch_vmx_struct {
> >       * pCPU and wakeup the related vCPU.
> >       */
> >      struct pi_blocking_vcpu pi_blocking;
> > +
> > +    spinlock_t            pi_hotplug_lock;
> 
> Having gone through all of the patch, I have a problem seeing the
> hotplug nature of this.

This is related to the assigned device hotplug.

Thanks,
Feng

> 
> > +    bool_t                pi_blocking_cleaned_up;
> 
> If this flag is to remain, it should move into its designated structure
> (right above your addition).
> 
> Jan
Jan Beulich May 31, 2016, 11:52 a.m. UTC | #3
>>> On 31.05.16 at 12:22, <feng.wu@intel.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: Friday, May 27, 2016 9:43 PM
>> >>> On 26.05.16 at 15:39, <feng.wu@intel.com> wrote:
>> > --- a/xen/arch/x86/hvm/vmx/vmx.c
>> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> > @@ -113,7 +113,19 @@ static void vmx_vcpu_block(struct vcpu *v)
>> >  		&per_cpu(vmx_pi_blocking, v->processor).lock;
>> >      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
>> >
>> > -    spin_lock_irqsave(pi_blocking_list_lock, flags);
>> > +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
>> > +    if ( unlikely(v->arch.hvm_vmx.pi_blocking_cleaned_up) )
>> > +    {
>> > +        /*
>> > +         * The vcpu is to be destroyed and it has already been removed
>> > +         * from the per-CPU list if it is blocking, we shouldn't add
>> > +         * new vCPU to the list.
>> > +         */
>> 
>> This comment appears to be stale wrt the implementation further
>> down. But see there for whether it's the comment or the code
>> that needs to change.
>> 
>> > +        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
>> > +        return;
>> > +    }
>> > +
>> > +    spin_lock(pi_blocking_list_lock);
>> 
>> There doesn't appear to be an active problem with this, but
>> taking a global lock inside a per-vCPU one feels bad. Both here
>> and in vmx_pi_blocking_cleanup() I can't see why the global
>> lock alone won't do - you acquire it anyway.
> 
> The purpose of 'v->arch.hvm_vmx.pi_hotplug_lock' is to make
> sure things go right when vmx_pi_blocking_cleanup() and
> vmx_vcpu_block() run concurrently. However, the lock
> 'pi_blocking_list_lock' in vmx_pi_blocking_cleanup() refers to
> 'v->arch.hvm_vmx.pi_blocking.lock', while 'pi_blocking_list_lock'
> in vmx_vcpu_block() is '&per_cpu(vmx_pi_blocking, v->processor).lock'.
> These two can be different; please consider the following scenario:
> 
> vmx_pi_blocking_cleanup() gets called while the vCPU is in the
> blocking state and 'v->arch.hvm_vmx.pi_blocking.lock' points to the
> per-cpu lock of pCPU0. When acquiring the lock in this function,
> another path wins (such as vmx_pi_do_resume() or pi_wakeup_interrupt()),
> so we spin in vmx_pi_blocking_cleanup(). After the vCPU is woken up,
> it runs on pCPU1; some time later the vCPU blocks again and enters
> vmx_vcpu_block() while the previous vmx_pi_blocking_cleanup() still
> hasn't finished its job. We then acquire a different lock (pCPU1's
> this time) in vmx_vcpu_block() than the one in
> vmx_pi_blocking_cleanup(). This can break things, since we might
> still put the vCPU on the per-cpu blocking list after the vCPU is
> destroyed or the last assigned device is detached.

Makes sense. Then the next question is whether you really need
to hold both locks at the same time. Please understand that I'd
really prefer for this to have a simpler solution - the original code
was already more complicated than one would really think it should
be, and now it's getting worse. While I can't immediately suggest
alternatives, it feels like the solution to the current problem may
rather be simplification instead of making things even more
complicated.

>> >  void vmx_pi_hooks_deassign(struct domain *d)
>> >  {
>> > +    struct vcpu *v;
>> > +
>> >      if ( !iommu_intpost || !has_hvm_container_domain(d) )
>> >          return;
>> >
>> > -    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
>> > -
>> > -    d->arch.hvm_domain.vmx.vcpu_block = NULL;
>> > -    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
>> > -    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
>> > -    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
>> > +    for_each_vcpu ( d, v )
>> > +        vmx_pi_blocking_cleanup(v);
>> 
>> If you keep the hooks in place, why is it relevant to do the cleanup
>> here instead of just at domain death? As suggested before, if you
>> did it there, you'd likely get away without adding the new per-vCPU
>> flag.
> 
> I don't quite understand this. Adding the cleanup here is to handle
> the corner case when the last assigned device is detached from the
> domain.

There's nothing to be cleaned up really if that de-assign isn't a
result of the domain being brought down.

> Why do you think we don't need the per-vCPU flag? We need
> to set it here to indicate that the vCPU has been cleaned up, and in
> vmx_vcpu_block(), we must not put vCPUs with this flag set on the
> per-cpu blocking list. Am I missing something?

When clean up is done only at domain destruction time, there's
per-domain state already that can be checked instead of this
per-vCPU flag.

>> Furthermore, if things remain the way they are now, a per-domain
>> flag would suffice.
> 
> vmx_pi_blocking_cleanup() is called in vmx_cpu_destroy(), which can
> be called during domain destruction or from vcpu_initialise() if it
> encounters errors. For the latter case, if we used a per-domain flag
> and set it in vmx_pi_blocking_cleanup(), that would be problematic,
> since it would affect other vCPUs which should be running normally.

I don't see how the error case of vcpu_initialise() can be problematic.
Any such vCPU would never run, and hence never want to block.

>> And finally - do you really need to retain all four hooks? Since if not,
>> one that gets zapped here could be used in place of such a per-
>> domain flag.
> 
> Can you elaborate a bit more on this? Thanks a lot!

I have a really hard time seeing what more I can say here. I dislike
redundant information, and hence if the flag value can be
deduced from some other field, I'd rather see that other field
used instead of yet another flag getting added.
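
For example (a purely hypothetical sketch - assuming pi_do_resume were
one of the hooks to get zapped on de-assign again, while vcpu_block is
retained):

    /* In vmx_vcpu_block(): a hook zapped in vmx_pi_hooks_deassign()
     * doubles as the "no assigned devices" indicator, so no new flag: */
    if ( !v->domain->arch.hvm_domain.vmx.pi_do_resume )
        return;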

>> > --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
>> > +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
>> > @@ -231,6 +231,9 @@ struct arch_vmx_struct {
>> >       * pCPU and wakeup the related vCPU.
>> >       */
>> >      struct pi_blocking_vcpu pi_blocking;
>> > +
>> > +    spinlock_t            pi_hotplug_lock;
>> 
>> Having gone through all of the patch, I have a problem seeing the
>> hotplug nature of this.
> 
> This is related to the assigned device hotplug.

This reply means nothing to me. As said - I'm missing the connection
between this name and the rest of the patch (where there is no
further talk of hotplug afair).

Jan
Wu, Feng June 3, 2016, 5:12 a.m. UTC | #4
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Tuesday, May 31, 2016 7:52 PM
> To: Wu, Feng <feng.wu@intel.com>
> Cc: andrew.cooper3@citrix.com; dario.faggioli@citrix.com;
> george.dunlap@eu.citrix.com; Tian, Kevin <kevin.tian@intel.com>; xen-
> devel@lists.xen.org; konrad.wilk@oracle.com; keir@xen.org
> Subject: RE: [PATCH v2 1/4] VMX: Properly handle pi when all the assigned
> devices are removed
> 
> >>> On 31.05.16 at 12:22, <feng.wu@intel.com> wrote:
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: Friday, May 27, 2016 9:43 PM
> >> >>> On 26.05.16 at 15:39, <feng.wu@intel.com> wrote:
> >> > --- a/xen/arch/x86/hvm/vmx/vmx.c
> >> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
> >> > @@ -113,7 +113,19 @@ static void vmx_vcpu_block(struct vcpu *v)
> >> >  		&per_cpu(vmx_pi_blocking, v->processor).lock;
> >> >      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> >> >
> >> > -    spin_lock_irqsave(pi_blocking_list_lock, flags);
> >> > +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> >> > +    if ( unlikely(v->arch.hvm_vmx.pi_blocking_cleaned_up) )
> >> > +    {
> >> > +        /*
> >> > +         * The vcpu is to be destroyed and it has already been removed
> >> > +         * from the per-CPU list if it is blocking, we shouldn't add
> >> > +         * new vCPU to the list.
> >> > +         */
> >>
> >> This comment appears to be stale wrt the implementation further
> >> down. But see there for whether it's the comment or the code
> >> that needs to change.
> >>
> >> > +        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
> >> > +        return;
> >> > +    }
> >> > +
> >> > +    spin_lock(pi_blocking_list_lock);
> >>
> >> There doesn't appear to be an active problem with this, but
> >> taking a global lock inside a per-vCPU one feels bad. Both here
> >> and in vmx_pi_blocking_cleanup() I can't see why the global
> >> lock alone won't do - you acquire it anyway.
> >
> > The purpose of 'v->arch.hvm_vmx.pi_hotplug_lock' is to make
> > sure things go right when vmx_pi_blocking_cleanup() and
> > vmx_vcpu_block() run concurrently. However, the lock
> > 'pi_blocking_list_lock' in vmx_pi_blocking_cleanup() refers to
> > 'v->arch.hvm_vmx.pi_blocking.lock', while 'pi_blocking_list_lock'
> > in vmx_vcpu_block() is '&per_cpu(vmx_pi_blocking, v->processor).lock'.
> > These two can be different; please consider the following scenario:
> >
> > vmx_pi_blocking_cleanup() gets called while the vCPU is in the
> > blocking state and 'v->arch.hvm_vmx.pi_blocking.lock' points to the
> > per-cpu lock of pCPU0. When acquiring the lock in this function,
> > another path wins (such as vmx_pi_do_resume() or pi_wakeup_interrupt()),
> > so we spin in vmx_pi_blocking_cleanup(). After the vCPU is woken up,
> > it runs on pCPU1; some time later the vCPU blocks again and enters
> > vmx_vcpu_block() while the previous vmx_pi_blocking_cleanup() still
> > hasn't finished its job. We then acquire a different lock (pCPU1's
> > this time) in vmx_vcpu_block() than the one in
> > vmx_pi_blocking_cleanup(). This can break things, since we might
> > still put the vCPU on the per-cpu blocking list after the vCPU is
> > destroyed or the last assigned device is detached.
> 
> Makes sense. Then the next question is whether you really need
> to hold both locks at the same time.

I think we need to hold both. The two spinlocks have different purposes:
'v->arch.hvm_vmx.pi_hotplug_lock' protects against races while a device
is being hot-unplugged or the domain is being torn down, while the other
one normally protects accesses to the blocking list.

> Please understand that I'd
> really prefer for this to have a simpler solution - the original code
> was already more complicated than one would really think it should
> be, and now it's getting worse. While I can't immediately suggest
> alternatives, it feels like the solution to the current problem may
> rather be simplification instead of making things even more
> complicated.

To be honest, comments like this make one frustrated. If you have
any better solution, we can discuss it here. However, just saying
the current solution is too complicated is not a good way to speed
up the process.

> 
> >> >  void vmx_pi_hooks_deassign(struct domain *d)
> >> >  {
> >> > +    struct vcpu *v;
> >> > +
> >> >      if ( !iommu_intpost || !has_hvm_container_domain(d) )
> >> >          return;
> >> >
> >> > -    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
> >> > -
> >> > -    d->arch.hvm_domain.vmx.vcpu_block = NULL;
> >> > -    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
> >> > -    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
> >> > -    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
> >> > +    for_each_vcpu ( d, v )
> >> > +        vmx_pi_blocking_cleanup(v);
> >>
> >> If you keep the hooks in place, why is it relevant to do the cleanup
> >> here instead of just at domain death? As suggested before, if you
> >> did it there, you'd likely get away without adding the new per-vCPU
> >> flag.
> >
> > I don't quite understand this. Adding the cleanup here is to handle
> > the corner case when the last assigned device is detached from the
> > domain.
> 
> There's nothing to be cleaned up really if that de-assign isn't a
> result of the domain being brought down.

I mentioned it clearly in [0/4] of this series. We _need_ to clean up
the blocking list when removing a device while the domain is running,
since vmx_vcpu_block() might be running at the same time. If that is
the case, there might be stale vCPUs left in the blocking list.

> 
> > Why do you think we don't need the per-vCPU flag? We need
> > to set it here to indicate that the vCPU has been cleaned up, and in
> > vmx_vcpu_block(), we must not put vCPUs with this flag set on the
> > per-cpu blocking list. Am I missing something?
> 
> When clean up is done only at domain destruction time, 

No.

Thanks,
Feng

> there's per-domain state already that can be checked instead of this
> per-vCPU flag.
> 
> >> Furthermore, if things remain the way they are now, a per-domain
> >> flag would suffice.
> >
> > vmx_pi_blocking_cleanup() is called in vmx_cpu_destroy(), which can
> > be called during domain destruction or from vcpu_initialise() if it
> > encounters errors. For the latter case, if we used a per-domain flag
> > and set it in vmx_pi_blocking_cleanup(), that would be problematic,
> > since it would affect other vCPUs which should be running normally.
> 
> I don't see how the error case of vcpu_initialise() can be problematic.
> Any such vCPU would never run, and hence never want to block.
> 
> >> And finally - do you really need to retain all four hooks? Since if not,
> >> one that gets zapped here could be used in place of such a per-
> >> domain flag.
> >
> > Can you elaborate a bit more on this? Thanks a lot!
> 
> I have a really hard time seeing what more I can say here. I dislike
> redundant information, and hence if the flag value can be
> deduced from some other field, I'd rather see that other field
> used instead of yet another flag getting added.
> 
> >> > --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> >> > +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> >> > @@ -231,6 +231,9 @@ struct arch_vmx_struct {
> >> >       * pCPU and wakeup the related vCPU.
> >> >       */
> >> >      struct pi_blocking_vcpu pi_blocking;
> >> > +
> >> > +    spinlock_t            pi_hotplug_lock;
> >>
> >> Having gone through all of the patch, I have a problem seeing the
> >> hotplug nature of this.
> >
> > This is related to the assigned device hotplug.
> 
> This reply means nothing to me. As said - I'm missing the connection
> between this name and the rest of the patch (where there is no
> further talk of hotplug afair).
> 
> Jan
Jan Beulich June 3, 2016, 9:45 a.m. UTC | #5
>>> On 03.06.16 at 07:12, <feng.wu@intel.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: Tuesday, May 31, 2016 7:52 PM
>> >>> On 31.05.16 at 12:22, <feng.wu@intel.com> wrote:
>> >> From: Jan Beulich [mailto:JBeulich@suse.com]
>> >> Sent: Friday, May 27, 2016 9:43 PM
>> >> >>> On 26.05.16 at 15:39, <feng.wu@intel.com> wrote:
>> >> > --- a/xen/arch/x86/hvm/vmx/vmx.c
>> >> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> >> > @@ -113,7 +113,19 @@ static void vmx_vcpu_block(struct vcpu *v)
>> >> >  		&per_cpu(vmx_pi_blocking, v->processor).lock;
>> >> >      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
>> >> >
>> >> > -    spin_lock_irqsave(pi_blocking_list_lock, flags);
>> >> > +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
>> >> > +    if ( unlikely(v->arch.hvm_vmx.pi_blocking_cleaned_up) )
>> >> > +    {
>> >> > +        /*
>> >> > +         * The vcpu is to be destroyed and it has already been removed
>> >> > +         * from the per-CPU list if it is blocking, we shouldn't add
>> >> > +         * new vCPU to the list.
>> >> > +         */
>> >> > +        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
>> >> > +        return;
>> >> > +    }
>> >> > +
>> >> > +    spin_lock(pi_blocking_list_lock);
>> >>
>> >> There doesn't appear to be an active problem with this, but
>> >> taking a global lock inside a per-vCPU one feels bad. Both here
>> >> and in vmx_pi_blocking_cleanup() I can't see why the global
>> >> lock alone won't do - you acquire it anyway.
>> >
>> > The purpose of 'v->arch.hvm_vmx.pi_hotplug_lock' is to make
>> > sure things go right when vmx_pi_blocking_cleanup() and
>> > vmx_vcpu_block() run concurrently. However, the lock
>> > 'pi_blocking_list_lock' in vmx_pi_blocking_cleanup() refers to
>> > 'v->arch.hvm_vmx.pi_blocking.lock', while 'pi_blocking_list_lock'
>> > in vmx_vcpu_block() is '&per_cpu(vmx_pi_blocking, v->processor).lock'.
>> > These two can be different; please consider the following scenario:
>> >
>> > vmx_pi_blocking_cleanup() gets called while the vCPU is in the
>> > blocking state and 'v->arch.hvm_vmx.pi_blocking.lock' points to the
>> > per-cpu lock of pCPU0. When acquiring the lock in this function,
>> > another path wins (such as vmx_pi_do_resume() or pi_wakeup_interrupt()),
>> > so we spin in vmx_pi_blocking_cleanup(). After the vCPU is woken up,
>> > it runs on pCPU1; some time later the vCPU blocks again and enters
>> > vmx_vcpu_block() while the previous vmx_pi_blocking_cleanup() still
>> > hasn't finished its job. We then acquire a different lock (pCPU1's
>> > this time) in vmx_vcpu_block() than the one in
>> > vmx_pi_blocking_cleanup(). This can break things, since we might
>> > still put the vCPU on the per-cpu blocking list after the vCPU is
>> > destroyed or the last assigned device is detached.
>> 
>> Makes sense. Then the next question is whether you really need
>> to hold both locks at the same time.
> 
> I think we need to hold both. The two spinlocks have different purposes:
> 'v->arch.hvm_vmx.pi_hotplug_lock' protects against races while a device
> is being hot-unplugged or the domain is being torn down, while the other
> one normally protects accesses to the blocking list.

I understand they have different purposes, but that doesn't mean
they need to be held at the same time. In particular (looking back
at the full patch), the newly added lock frames not only the newly
added code, but also the code inside the other locked region. The
natural thing to expect would be that you need one lock for that
new code, and the other for the pre-existing one. I.e. the function
would still acquire both locks, but not one inside the other.
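
I.e. something of this shape (a sketch only - whether the window
between the two locked regions is benign is of course what would need
auditing):

    bool_t cleaned_up;

    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
    cleaned_up = v->arch.hvm_vmx.pi_blocking_cleaned_up;
    spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);

    if ( cleaned_up )
        return;

    spin_lock_irqsave(pi_blocking_list_lock, flags);
    /* ... pre-existing insertion onto the per-CPU blocking list ... */
    spin_unlock_irqrestore(pi_blocking_list_lock, flags);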

>> Please understand that I'd
>> really prefer for this to have a simpler solution - the original code
>> was already more complicated than one would really think it should
>> be, and now it's getting worse. While I can't immediately suggest
>> alternatives, it feels like the solution to the current problem may
>> rather be simplification instead of making things even more
>> complicated.
> 
> To be honest, comments like this make one frustrated. If you have
> any better solution, we can discuss it here. However, just saying
> the current solution is too complicated is not a good way to speed
> up the process.

I can understand your frustration. But please understand that I'm
also getting frustrated - by more and more difficult-to-maintain code
getting added. Please understand that addressing the immediate
problem is only one aspect; another is the long-term maintainability
of whatever gets added for that purpose. But see below.

It is a more general observation of mine (also, just fyi, in my own
code) that if complicated things need even more complicated fixes,
then quite often something is wrong with the original complicated
arrangements / design.

>> >> >  void vmx_pi_hooks_deassign(struct domain *d)
>> >> >  {
>> >> > +    struct vcpu *v;
>> >> > +
>> >> >      if ( !iommu_intpost || !has_hvm_container_domain(d) )
>> >> >          return;
>> >> >
>> >> > -    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
>> >> > -
>> >> > -    d->arch.hvm_domain.vmx.vcpu_block = NULL;
>> >> > -    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
>> >> > -    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
>> >> > -    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
>> >> > +    for_each_vcpu ( d, v )
>> >> > +        vmx_pi_blocking_cleanup(v);
>> >>
>> >> If you keep the hooks in place, why is it relevant to do the cleanup
>> >> here instead of just at domain death? As suggested before, if you
>> >> did it there, you'd likely get away without adding the new per-vCPU
>> >> flag.
>> >
>> > I don't quite understand this. Adding the cleanup here is to handle
>> > the corner case when the last assigned device is detached from the
>> > domain.
>> 
>> There's nothing to be cleaned up really if that de-assign isn't a
>> result of the domain being brought down.
> 
> I mentioned it clearly in [0/4] of this series. We _need_ to clean up
> the blocking list when removing a device while the domain is running,
> since vmx_vcpu_block() might be running at the same time. If that is
> the case, there might be stale vCPUs left in the blocking list.

I have to admit that the description there is far from clear to me,
even more so in the specific regard being discussed here.

Anyway - if there are problems here, did you consider simply
broadcasting a PI wakeup interrupt after device removal (and
after having "manually" set the condition to satisfy the
pi_test_on() in pi_wakeup_interrupt())? It would seem to me that
this would leverage existing code without adding new complicated
logic. But of course I may be overlooking something.
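
Roughly like this (a hypothetical sketch - helper and vector names as
introduced by the original PI series, to be double-checked):

    /* On de-assign of the last device: make each vCPU look "pending"
     * and let pi_wakeup_interrupt() dequeue it via the existing path. */
    for_each_vcpu ( d, v )
    {
        pi_test_and_set_on(&v->arch.hvm_vmx.pi_desc); /* satisfy pi_test_on() */
        send_IPI_mask(cpumask_of(v->processor), pi_wakeup_vector);
    }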

Jan

Patch

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index bc4410f..65f5288 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -113,7 +113,19 @@  static void vmx_vcpu_block(struct vcpu *v)
 		&per_cpu(vmx_pi_blocking, v->processor).lock;
     struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
 
-    spin_lock_irqsave(pi_blocking_list_lock, flags);
+    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
+    if ( unlikely(v->arch.hvm_vmx.pi_blocking_cleaned_up) )
+    {
+        /*
+         * The vcpu is to be destroyed and it has already been removed
+         * from the per-CPU list if it is blocking, we shouldn't add
+         * new vCPU to the list.
+         */
+        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
+        return;
+    }
+
+    spin_lock(pi_blocking_list_lock);
     old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
                        pi_blocking_list_lock);
 
@@ -126,7 +138,9 @@  static void vmx_vcpu_block(struct vcpu *v)
 
     list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
                   &per_cpu(vmx_pi_blocking, v->processor).list);
-    spin_unlock_irqrestore(pi_blocking_list_lock, flags);
+    spin_unlock(pi_blocking_list_lock);
+
+    spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
 
     ASSERT(!pi_test_sn(pi_desc));
 
@@ -199,32 +213,65 @@  static void vmx_pi_do_resume(struct vcpu *v)
     spin_unlock_irqrestore(pi_blocking_list_lock, flags);
 }
 
+static void vmx_pi_blocking_cleanup(struct vcpu *v)
+{
+    unsigned long flags;
+    spinlock_t *pi_blocking_list_lock;
+
+    if ( !iommu_intpost )
+        return;
+
+    spin_lock_irqsave(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
+    v->arch.hvm_vmx.pi_blocking_cleaned_up = 1;
+
+    pi_blocking_list_lock = v->arch.hvm_vmx.pi_blocking.lock;
+    if (pi_blocking_list_lock == NULL)
+    {
+        spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
+        return;
+    }
+
+    spin_lock(pi_blocking_list_lock);
+    if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
+    {
+        ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
+        list_del(&v->arch.hvm_vmx.pi_blocking.list);
+        v->arch.hvm_vmx.pi_blocking.lock = NULL;
+    }
+    spin_unlock(pi_blocking_list_lock);
+    spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_hotplug_lock, flags);
+}
+
 /* This function is called when pcidevs_lock is held */
 void vmx_pi_hooks_assign(struct domain *d)
 {
+    struct vcpu *v;
+
     if ( !iommu_intpost || !has_hvm_container_domain(d) )
         return;
 
-    ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
+    for_each_vcpu ( d, v )
+        v->arch.hvm_vmx.pi_blocking_cleaned_up = 0;
 
-    d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
-    d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
-    d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
-    d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
+    if ( !d->arch.hvm_domain.vmx.vcpu_block )
+    {
+        d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
+        d->arch.hvm_domain.vmx.pi_switch_from = vmx_pi_switch_from;
+        d->arch.hvm_domain.vmx.pi_switch_to = vmx_pi_switch_to;
+        d->arch.hvm_domain.vmx.pi_do_resume = vmx_pi_do_resume;
+    }
 }
 
 /* This function is called when pcidevs_lock is held */
 void vmx_pi_hooks_deassign(struct domain *d)
 {
+    struct vcpu *v;
+
     if ( !iommu_intpost || !has_hvm_container_domain(d) )
         return;
 
-    ASSERT(d->arch.hvm_domain.vmx.vcpu_block);
-
-    d->arch.hvm_domain.vmx.vcpu_block = NULL;
-    d->arch.hvm_domain.vmx.pi_switch_from = NULL;
-    d->arch.hvm_domain.vmx.pi_switch_to = NULL;
-    d->arch.hvm_domain.vmx.pi_do_resume = NULL;
+    for_each_vcpu ( d, v )
+        vmx_pi_blocking_cleanup(v);
 }
 
 static int vmx_domain_initialise(struct domain *d)
@@ -256,6 +303,8 @@  static int vmx_vcpu_initialise(struct vcpu *v)
 
     INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocking.list);
 
+    spin_lock_init(&v->arch.hvm_vmx.pi_hotplug_lock);
+
     v->arch.schedule_tail    = vmx_do_resume;
     v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
     v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index b54f52f..3834f49 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -231,6 +231,9 @@  struct arch_vmx_struct {
      * pCPU and wakeup the related vCPU.
      */
     struct pi_blocking_vcpu pi_blocking;
+
+    spinlock_t            pi_hotplug_lock;
+    bool_t                pi_blocking_cleaned_up;
 };
 
 int vmx_create_vmcs(struct vcpu *v);