
[1/3] VMX: Properly adjust the status of the PI descriptor

Message ID 1463734431-22353-2-git-send-email-feng.wu@intel.com (mailing list archive)
State New, archived

Commit Message

Wu, Feng May 20, 2016, 8:53 a.m. UTC
When the last assigned device is detached from the domain, all
the PI related hooks are removed; however, the vCPU can still be
blocked, switched to another pCPU, etc., all without PI being
aware of it. The next time we attach another device to the domain,
which makes the PI related hooks available again, the status of
the PI descriptor is stale, so we need to properly adjust it.

Signed-off-by: Feng Wu <feng.wu@intel.com>
---
 xen/arch/x86/hvm/vmx/vmx.c         | 29 ++++++++++++++++++++++++++---
 xen/include/asm-x86/hvm/vmx/vmcs.h |  1 +
 2 files changed, 27 insertions(+), 3 deletions(-)

Comments

Tian, Kevin May 23, 2016, 5:15 a.m. UTC | #1
> From: Wu, Feng
> Sent: Friday, May 20, 2016 4:54 PM
> 
> When the last assigned device is dettached from the domain, all
> the PI related hooks are removed then, however, the vCPU can be
> blocked, switched to another pCPU, etc, all without the aware of
> PI. After the next time we attach another device to the domain,
> which makes the PI realted hooks avaliable again, the status
> of the pi descriptor is not true, we need to properly adjust
> it.

Instead of adjusting the PI descriptor in multiple places, can we
simply reset its status (including removing the vCPU from the
blocking list) right when the hooks are removed at detach?

> 
> Signed-off-by: Feng Wu <feng.wu@intel.com>
> ---
>  xen/arch/x86/hvm/vmx/vmx.c         | 29
> ++++++++++++++++++++++++++---
>  xen/include/asm-x86/hvm/vmx/vmcs.h |  1 +
>  2 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index bc4410f..3fbc7b1 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -107,12 +107,22 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>  static void vmx_vcpu_block(struct vcpu *v)
>  {
>      unsigned long flags;
> -    unsigned int dest;
> +    unsigned int dest = cpu_physical_id(v->processor);
>      spinlock_t *old_lock;
>      spinlock_t *pi_blocking_list_lock =
>  		&per_cpu(vmx_pi_blocking, v->processor).lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> 
> +    if (v->arch.hvm_vmx.pi_back_from_hotplug == 1)
> +    {
> +        write_atomic(&pi_desc->ndst,
> +                     x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
> +        write_atomic(&pi_desc->nv, posted_intr_vector);
> +        pi_clear_sn(pi_desc);
> +
> +        v->arch.hvm_vmx.pi_back_from_hotplug = 0;
> +    }
> +
>      spin_lock_irqsave(pi_blocking_list_lock, flags);
>      old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
>                         pi_blocking_list_lock);
> @@ -130,8 +140,6 @@ static void vmx_vcpu_block(struct vcpu *v)
> 
>      ASSERT(!pi_test_sn(pi_desc));
> 
> -    dest = cpu_physical_id(v->processor);
> -
>      ASSERT(pi_desc->ndst ==
>             (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
> 
> @@ -164,6 +172,16 @@ static void vmx_pi_do_resume(struct vcpu *v)
>      unsigned long flags;
>      spinlock_t *pi_blocking_list_lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    unsigned int dest = cpu_physical_id(v->processor);
> +
> +    if (v->arch.hvm_vmx.pi_back_from_hotplug == 1)
> +    {
> +        write_atomic(&pi_desc->ndst,
> +                     x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
> +        pi_clear_sn(pi_desc);
> +
> +        v->arch.hvm_vmx.pi_back_from_hotplug = 0;
> +    }
> 
>      ASSERT(!test_bit(_VPF_blocked, &v->pause_flags));
> 
> @@ -202,9 +220,14 @@ static void vmx_pi_do_resume(struct vcpu *v)
>  /* This function is called when pcidevs_lock is held */
>  void vmx_pi_hooks_assign(struct domain *d)
>  {
> +    struct vcpu *v;
> +
>      if ( !iommu_intpost || !has_hvm_container_domain(d) )
>          return;
> 
> +    for_each_vcpu ( d, v )
> +        v->arch.hvm_vmx.pi_back_from_hotplug = 1;
> +
>      ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
> 
>      d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
> diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h
> b/xen/include/asm-x86/hvm/vmx/vmcs.h
> index b54f52f..3feb60a 100644
> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -231,6 +231,7 @@ struct arch_vmx_struct {
>       * pCPU and wakeup the related vCPU.
>       */
>      struct pi_blocking_vcpu pi_blocking;
> +    int pi_back_from_hotplug;
>  };
> 
>  int vmx_create_vmcs(struct vcpu *v);
> --
> 2.1.0
Wu, Feng May 23, 2016, 5:27 a.m. UTC | #2
> -----Original Message-----
> From: Tian, Kevin
> Sent: Monday, May 23, 2016 1:15 PM
> To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org
> Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
> george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
> konrad.wilk@oracle.com
> Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
> 
> > From: Wu, Feng
> > Sent: Friday, May 20, 2016 4:54 PM
> >
> > When the last assigned device is dettached from the domain, all
> > the PI related hooks are removed then, however, the vCPU can be
> > blocked, switched to another pCPU, etc, all without the aware of
> > PI. After the next time we attach another device to the domain,
> > which makes the PI realted hooks avaliable again, the status
> > of the pi descriptor is not true, we need to properly adjust
> > it.
> 
> Instead of adjusting pi descriptor in multiple places, can we
> simply reset the status (including removing from block list)
> right when hooks are removed at deattach?
> 

I also thought about this idea before. The problem is that while
the hooks are being removed at the PCI device's detachment, they
may still be running at the same time, which can cause problems.
For example, after we have removed the vCPU from the blocking list,
vmx_vcpu_block() (which started running before the hooks were
removed and has not finished yet) adds the vCPU back to the same
blocking list, so this vCPU will remain in the list after the
device is detached from the domain. For the same reason, if we
change the PI desc in vmx_pi_hooks_deassign() while it is being
used in the hooks, it may cause problems.

Thanks,
Feng
Tian, Kevin May 23, 2016, 6:52 a.m. UTC | #3
> From: Wu, Feng
> Sent: Monday, May 23, 2016 1:28 PM
> 
> 
> > -----Original Message-----
> > From: Tian, Kevin
> > Sent: Monday, May 23, 2016 1:15 PM
> > To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org
> > Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
> > george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
> > konrad.wilk@oracle.com
> > Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
> >
> > > From: Wu, Feng
> > > Sent: Friday, May 20, 2016 4:54 PM
> > >
> > > When the last assigned device is dettached from the domain, all
> > > the PI related hooks are removed then, however, the vCPU can be
> > > blocked, switched to another pCPU, etc, all without the aware of
> > > PI. After the next time we attach another device to the domain,
> > > which makes the PI realted hooks avaliable again, the status
> > > of the pi descriptor is not true, we need to properly adjust
> > > it.
> >
> > Instead of adjusting pi descriptor in multiple places, can we
> > simply reset the status (including removing from block list)
> > right when hooks are removed at deattach?
> >
> 
> I also thought about this idea before, the problem is that when
> the hooks are being removed at the pci device's deattachment,
> the hooks may be running at the same time, which may cause
> problems, such as, after we have removed the vCPU from the
> blocking list, vmx_vcpu_block() (which has been running before the
> hooks were removed and not finished yet.) adds a vCPU to the same
> blocking list, then this vCPU will remain in the list after the device is
> deattached from the domain. At the same reason, if we change PI desc
> in vmx_pi_hooks_deassign() while it is being used in the hooks, it
> may cause problems.
> 

It's not just about updating the PI desc. The race can exist when
changing the hooks too:

void vmx_pi_hooks_deassign(struct domain *d)
	...
	d->arch.hvm_domain.vmx.pi_switch_from = NULL;
	...

static void vmx_ctxt_switch_from(struct vcpu *v)
	...
	if ( v->domain->arch.hvm_domain.vmx.pi_switch_from )
        v->domain->arch.hvm_domain.vmx.pi_switch_from(v);

If the callback is cleared between the above two lines, you'll
dereference a NULL pointer on the second line.

I thought there would be some protection mechanism in place for
such a fundamental race, which might then be extended to cover the
PI desc too, but that doesn't look to be the case. Would you mind
pointing it out if this is already well handled? :-)

Somehow I'm wondering whether we really need such dynamic
callback changes in an SMP environment. Is it safer to introduce
a per-domain flag to indicate the above scenario, so that each
callback can return early at negligible cost and there is no need
to reset the hooks once they are registered?

Thanks
Kevin
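
(A minimal sketch of this per-domain-flag idea, purely illustrative — the
pi_active field below is hypothetical, and the real callback of course does
more than what is shown:)

/* Hypothetical flag, e.g. in struct vmx_domain: set in
 * vmx_pi_hooks_assign(), cleared in vmx_pi_hooks_deassign();
 * the hooks themselves would then stay registered. */

static void vmx_vcpu_block(struct vcpu *v)
{
    /* Early exit at negligible cost when PI is not in use for this domain. */
    if ( !v->domain->arch.hvm_domain.vmx.pi_active )
        return;

    /* ... existing blocking-list / PI descriptor handling ... */
}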
Wu, Feng May 23, 2016, 7:16 a.m. UTC | #4
> -----Original Message-----
> From: Tian, Kevin
> Sent: Monday, May 23, 2016 2:52 PM
> To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org
> Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
> george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
> konrad.wilk@oracle.com
> Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
> 
> > From: Wu, Feng
> > Sent: Monday, May 23, 2016 1:28 PM
> >
> >
> > > -----Original Message-----
> > > From: Tian, Kevin
> > > Sent: Monday, May 23, 2016 1:15 PM
> > > To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org
> > > Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
> > > george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
> > > konrad.wilk@oracle.com
> > > Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
> > >
> > > > From: Wu, Feng
> > > > Sent: Friday, May 20, 2016 4:54 PM
> > > >
> > > > When the last assigned device is dettached from the domain, all
> > > > the PI related hooks are removed then, however, the vCPU can be
> > > > blocked, switched to another pCPU, etc, all without the aware of
> > > > PI. After the next time we attach another device to the domain,
> > > > which makes the PI realted hooks avaliable again, the status
> > > > of the pi descriptor is not true, we need to properly adjust
> > > > it.
> > >
> > > Instead of adjusting pi descriptor in multiple places, can we
> > > simply reset the status (including removing from block list)
> > > right when hooks are removed at deattach?
> > >
> >
> > I also thought about this idea before, the problem is that when
> > the hooks are being removed at the pci device's deattachment,
> > the hooks may be running at the same time, which may cause
> > problems, such as, after we have removed the vCPU from the
> > blocking list, vmx_vcpu_block() (which has been running before the
> > hooks were removed and not finished yet.) adds a vCPU to the same
> > blocking list, then this vCPU will remain in the list after the device is
> > deattached from the domain. At the same reason, if we change PI desc
> > in vmx_pi_hooks_deassign() while it is being used in the hooks, it
> > may cause problems.
> >
> 
> It's not just about updating PI desc. The race could exist for changing
> hooks too:
> 
> void vmx_pi_hooks_deassign(struct domain *d)
> 	...
> 	d->arch.hvm_domain.vmx.pi_switch_from = NULL;
> 	...
> 
> static void vmx_ctxt_switch_from(struct vcpu *v)
> 	...
> 	if ( v->domain->arch.hvm_domain.vmx.pi_switch_from )
>         v->domain->arch.hvm_domain.vmx.pi_switch_from(v);
> 
> If the callback is cleared between above two lines, you'll refer to
> a NULL pointer in the 2nd line.
> 
> I thought there should be some protection mechanism in place for
> such fundamental race, which might be extended to cover PI desc
> too. But looks not the case. Would you mind pointing it out if already
> well handled? :-)

You are right, that can happen. However, in this case it will not
introduce any bad impact other than the PI desc retaining its old
value. But the device is detached, so that doesn't matter, as long
as we re-adjust it to the right value once a device is attached
again. And that is what I am doing in this patch.

> 
> Somehow I'm thinking whether we really need such dynamic
> callback changes in a SMP environment. Is it safer to introduce
> some per-domain flag to indicate above scenario so each callback
> can return early with negligible cost so no need to reset them once
> registered?

Yes, I agree with you. As you can see, the current issue exists
largely because of the dynamic assigning/deassigning of the PI
hooks, and there are lots of cases to be concerned about when
using it. I think it is worth trying what you mentioned above.
However, this method was suggested by Jan before, so I'd like to
know Jan's opinion about this.

Hi Jan, any ideas? Thanks!

Thanks,
Feng

> 
> Thanks
> Kevin
Jan Beulich May 23, 2016, 9:03 a.m. UTC | #5
>>> On 23.05.16 at 09:16, <feng.wu@intel.com> wrote:

> 
>> -----Original Message-----
>> From: Tian, Kevin
>> Sent: Monday, May 23, 2016 2:52 PM
>> To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org 
>> Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
>> george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
>> konrad.wilk@oracle.com 
>> Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
>> 
>> > From: Wu, Feng
>> > Sent: Monday, May 23, 2016 1:28 PM
>> >
>> >
>> > > -----Original Message-----
>> > > From: Tian, Kevin
>> > > Sent: Monday, May 23, 2016 1:15 PM
>> > > To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org 
>> > > Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
>> > > george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
>> > > konrad.wilk@oracle.com 
>> > > Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
>> > >
>> > > > From: Wu, Feng
>> > > > Sent: Friday, May 20, 2016 4:54 PM
>> > > >
>> > > > When the last assigned device is dettached from the domain, all
>> > > > the PI related hooks are removed then, however, the vCPU can be
>> > > > blocked, switched to another pCPU, etc, all without the aware of
>> > > > PI. After the next time we attach another device to the domain,
>> > > > which makes the PI realted hooks avaliable again, the status
>> > > > of the pi descriptor is not true, we need to properly adjust
>> > > > it.
>> > >
>> > > Instead of adjusting pi descriptor in multiple places, can we
>> > > simply reset the status (including removing from block list)
>> > > right when hooks are removed at deattach?
>> > >
>> >
>> > I also thought about this idea before, the problem is that when
>> > the hooks are being removed at the pci device's deattachment,
>> > the hooks may be running at the same time, which may cause
>> > problems, such as, after we have removed the vCPU from the
>> > blocking list, vmx_vcpu_block() (which has been running before the
>> > hooks were removed and not finished yet.) adds a vCPU to the same
>> > blocking list, then this vCPU will remain in the list after the device is
>> > deattached from the domain. At the same reason, if we change PI desc
>> > in vmx_pi_hooks_deassign() while it is being used in the hooks, it
>> > may cause problems.
>> >
>> 
>> It's not just about updating PI desc. The race could exist for changing
>> hooks too:
>> 
>> void vmx_pi_hooks_deassign(struct domain *d)
>> 	...
>> 	d->arch.hvm_domain.vmx.pi_switch_from = NULL;
>> 	...
>> 
>> static void vmx_ctxt_switch_from(struct vcpu *v)
>> 	...
>> 	if ( v->domain->arch.hvm_domain.vmx.pi_switch_from )
>>         v->domain->arch.hvm_domain.vmx.pi_switch_from(v);
>> 
>> If the callback is cleared between above two lines, you'll refer to
>> a NULL pointer in the 2nd line.
>> 
>> I thought there should be some protection mechanism in place for
>> such fundamental race, which might be extended to cover PI desc
>> too. But looks not the case. Would you mind pointing it out if already
>> well handled? :-)
> 
> You are right, that can happen. However, in this case, it will not introduce
> any bad impacts other than the PI desc is remaining the old value.

How so, if you agree that what Kevin described can happen? He's
pointing out a possible NULL deref...

>> Somehow I'm thinking whether we really need such dynamic
>> callback changes in a SMP environment. Is it safer to introduce
>> some per-domain flag to indicate above scenario so each callback
>> can return early with negligible cost so no need to reset them once
>> registered?
> 
> Yes, I agree with you. As you can see, the current issue is kind of because
> of the dynamically assigning/deassigning the pi hooks, and there are
> lots of concern cases by using it. I think it worth try what you mentioned
> above. However, this method was sugguested by Jan before, so I'd like
> to know what is Jan's option about this.

Avoiding the indirect calls would certainly be nice. Apart from
reading the function pointer into a local variable, use of which is
separated from the read by a barrier() (to address the NULL
deref concern), did you consider zapping the pointers in an RCU
callback instead of right when being requested?

Jan
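
(A minimal sketch of the local-pointer-plus-barrier() pattern Jan describes,
using the field name from Kevin's earlier example; other work done in
vmx_ctxt_switch_from() is elided, and this is illustrative only:)

static void vmx_ctxt_switch_from(struct vcpu *v)
{
    void (*pi_cb)(struct vcpu *) =
        v->domain->arch.hvm_domain.vmx.pi_switch_from;

    /* Compiler barrier: the hook pointer is read exactly once above, so a
     * concurrent vmx_pi_hooks_deassign() clearing the field cannot turn
     * the NULL check below into a NULL dereference. */
    barrier();

    if ( pi_cb )
        pi_cb(v);

    /* ... remaining context-switch-from work ... */
}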
Wu, Feng May 23, 2016, 9:21 a.m. UTC | #6
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Monday, May 23, 2016 5:04 PM
> To: Wu, Feng <feng.wu@intel.com>
> Cc: andrew.cooper3@citrix.com; dario.faggioli@citrix.com;
> george.dunlap@eu.citrix.com; Tian, Kevin <kevin.tian@intel.com>; xen-
> devel@lists.xen.org; konrad.wilk@oracle.com; keir@xen.org
> Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
> 
> >>> On 23.05.16 at 09:16, <feng.wu@intel.com> wrote:
> 
> >
> >> -----Original Message-----
> >> From: Tian, Kevin
> >> Sent: Monday, May 23, 2016 2:52 PM
> >> To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org
> >> Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
> >> george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
> >> konrad.wilk@oracle.com
> >> Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
> >>
> >> > From: Wu, Feng
> >> > Sent: Monday, May 23, 2016 1:28 PM
> >> >
> >> >
> >> > > -----Original Message-----
> >> > > From: Tian, Kevin
> >> > > Sent: Monday, May 23, 2016 1:15 PM
> >> > > To: Wu, Feng <feng.wu@intel.com>; xen-devel@lists.xen.org
> >> > > Cc: keir@xen.org; jbeulich@suse.com; andrew.cooper3@citrix.com;
> >> > > george.dunlap@eu.citrix.com; dario.faggioli@citrix.com;
> >> > > konrad.wilk@oracle.com
> >> > > Subject: RE: [PATCH 1/3] VMX: Properly adjuest the status of pi descriptor
> >> > >
> >> > > > From: Wu, Feng
> >> > > > Sent: Friday, May 20, 2016 4:54 PM
> >> > > >
> >> > > > When the last assigned device is dettached from the domain, all
> >> > > > the PI related hooks are removed then, however, the vCPU can be
> >> > > > blocked, switched to another pCPU, etc, all without the aware of
> >> > > > PI. After the next time we attach another device to the domain,
> >> > > > which makes the PI realted hooks avaliable again, the status
> >> > > > of the pi descriptor is not true, we need to properly adjust
> >> > > > it.
> >> > >
> >> > > Instead of adjusting pi descriptor in multiple places, can we
> >> > > simply reset the status (including removing from block list)
> >> > > right when hooks are removed at deattach?
> >> > >
> >> >
> >> > I also thought about this idea before, the problem is that when
> >> > the hooks are being removed at the pci device's deattachment,
> >> > the hooks may be running at the same time, which may cause
> >> > problems, such as, after we have removed the vCPU from the
> >> > blocking list, vmx_vcpu_block() (which has been running before the
> >> > hooks were removed and not finished yet.) adds a vCPU to the same
> >> > blocking list, then this vCPU will remain in the list after the device is
> >> > deattached from the domain. At the same reason, if we change PI desc
> >> > in vmx_pi_hooks_deassign() while it is being used in the hooks, it
> >> > may cause problems.
> >> >
> >>
> >> It's not just about updating PI desc. The race could exist for changing
> >> hooks too:
> >>
> >> void vmx_pi_hooks_deassign(struct domain *d)
> >> 	...
> >> 	d->arch.hvm_domain.vmx.pi_switch_from = NULL;
> >> 	...
> >>
> >> static void vmx_ctxt_switch_from(struct vcpu *v)
> >> 	...
> >> 	if ( v->domain->arch.hvm_domain.vmx.pi_switch_from )
> >>         v->domain->arch.hvm_domain.vmx.pi_switch_from(v);
> >>
> >> If the callback is cleared between above two lines, you'll refer to
> >> a NULL pointer in the 2nd line.
> >>
> >> I thought there should be some protection mechanism in place for
> >> such fundamental race, which might be extended to cover PI desc
> >> too. But looks not the case. Would you mind pointing it out if already
> >> well handled? :-)
> >
> > You are right, that can happen. However, in this case, it will not introduce
> > any bad impacts other than the PI desc is remaining the old value.
> 
> How that, if you agree that what Kevin described can happen? He's
> pointing out a possible NULL deref...

Oh, sorry, I misunderstood Kevin's point. Yes, that should be an issue.

> 
> >> Somehow I'm thinking whether we really need such dynamic
> >> callback changes in a SMP environment. Is it safer to introduce
> >> some per-domain flag to indicate above scenario so each callback
> >> can return early with negligible cost so no need to reset them once
> >> registered?
> >
> > Yes, I agree with you. As you can see, the current issue is kind of because
> > of the dynamically assigning/deassigning the pi hooks, and there are
> > lots of concern cases by using it. I think it worth try what you mentioned
> > above. However, this method was sugguested by Jan before, so I'd like
> > to know what is Jan's option about this.
> 
> Avoiding the indirect calls would certainly be nice. Apart from
> reading the function pointer into a local variable, use of which is
> separated from the read by a barrier() (to address the NULL
> deref concern), did you consider zapping the pointers in an RCU
> callback instead of right when being requested?

Is there any example of an RCU callback in the current Xen code,
so I can do some investigation on it first?

Thanks,
Feng

> 
> Jan
Jan Beulich May 23, 2016, 11:04 a.m. UTC | #7
>>> On 23.05.16 at 11:21, <feng.wu@intel.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: Monday, May 23, 2016 5:04 PM
>> >>> On 23.05.16 at 09:16, <feng.wu@intel.com> wrote:
>> >> From: Tian, Kevin
>> >> Sent: Monday, May 23, 2016 2:52 PM
>> >> Somehow I'm thinking whether we really need such dynamic
>> >> callback changes in a SMP environment. Is it safer to introduce
>> >> some per-domain flag to indicate above scenario so each callback
>> >> can return early with negligible cost so no need to reset them once
>> >> registered?
>> >
>> > Yes, I agree with you. As you can see, the current issue is kind of because
>> > of the dynamically assigning/deassigning the pi hooks, and there are
>> > lots of concern cases by using it. I think it worth try what you mentioned
>> > above. However, this method was sugguested by Jan before, so I'd like
>> > to know what is Jan's option about this.
>> 
>> Avoiding the indirect calls would certainly be nice. Apart from
>> reading the function pointer into a local variable, use of which is
>> separated from the read by a barrier() (to address the NULL
>> deref concern), did you consider zapping the pointers in an RCU
>> callback instead of right when being requested?
> 
> Is there any example for the RCU callback in current Xen code,
> so I can first have some investigation on it.

Did you try to find any before asking, e.g. by grep-ing for "rcu",
or (more precisely) "call_rcu"?

Jan
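
(For reference, the general shape such call_rcu() users take — the foo /
free_foo_rcu / retire_foo names below are made up; only the rcu_head /
call_rcu() usage itself reflects the interface being grep'd for:)

struct foo {
    /* ... payload ... */
    struct rcu_head rcu;
};

static void free_foo_rcu(struct rcu_head *head)
{
    struct foo *f = container_of(head, struct foo, rcu);

    xfree(f);
}

static void retire_foo(struct foo *f)
{
    /* Defer the actual teardown until after a grace period, instead of
     * doing it immediately while other CPUs may still be using it. */
    call_rcu(&f->rcu, free_foo_rcu);
}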
Jan Beulich May 23, 2016, 12:30 p.m. UTC | #8
>>> On 20.05.16 at 10:53, <feng.wu@intel.com> wrote:
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -107,12 +107,22 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>  static void vmx_vcpu_block(struct vcpu *v)
>  {
>      unsigned long flags;
> -    unsigned int dest;
> +    unsigned int dest = cpu_physical_id(v->processor);
>      spinlock_t *old_lock;
>      spinlock_t *pi_blocking_list_lock =
>  		&per_cpu(vmx_pi_blocking, v->processor).lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
>  
> +    if (v->arch.hvm_vmx.pi_back_from_hotplug == 1)

Coding style (missing blanks). Also the flag is of boolean nature, so
it should be declared bool_t and you should drop the "== 1" here.

> +    {
> +        write_atomic(&pi_desc->ndst,
> +                     x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
> +        write_atomic(&pi_desc->nv, posted_intr_vector);
> +        pi_clear_sn(pi_desc);
> +
> +        v->arch.hvm_vmx.pi_back_from_hotplug = 0;
> +    }
> +
>      spin_lock_irqsave(pi_blocking_list_lock, flags);
>      old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
>                         pi_blocking_list_lock);
> @@ -130,8 +140,6 @@ static void vmx_vcpu_block(struct vcpu *v)
>  
>      ASSERT(!pi_test_sn(pi_desc));
>  
> -    dest = cpu_physical_id(v->processor);
> -
>      ASSERT(pi_desc->ndst ==
>             (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
>  
> @@ -164,6 +172,16 @@ static void vmx_pi_do_resume(struct vcpu *v)
>      unsigned long flags;
>      spinlock_t *pi_blocking_list_lock;
>      struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    unsigned int dest = cpu_physical_id(v->processor);
> +
> +    if (v->arch.hvm_vmx.pi_back_from_hotplug == 1)
> +    {
> +        write_atomic(&pi_desc->ndst,
> +                     x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
> +        pi_clear_sn(pi_desc);
> +
> +        v->arch.hvm_vmx.pi_back_from_hotplug = 0;
> +    }

Compared with the adjustment above you don't write ->nv here, as
that happens ...

>      ASSERT(!test_bit(_VPF_blocked, &v->pause_flags));

after this ASSERT() (which suggests that your code addition would
better go here). That raises the question of moving the entire body
of the first hunk's if() into a helper function, which then would also
be used here. That would then also address my question about
ordering: Shouldn't pi_clear_sn() always happen last?

Furthermore, are both of these additions really eliminating the
issue rather than just reducing the likelihood for it to occur? I.e.
what if the new flag gets set immediately after either of the if()-s
above?
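
(The helper Jan describes above might look roughly like this — the name
vmx_pi_adjust_desc() is invented here, whether ->nv needs rewriting on the
resume path is exactly the inconsistency noted above, and pi_clear_sn() is
done last per the ordering question:)

static void vmx_pi_adjust_desc(struct vcpu *v)
{
    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
    unsigned int dest = cpu_physical_id(v->processor);

    if ( !v->arch.hvm_vmx.pi_back_from_hotplug )
        return;

    write_atomic(&pi_desc->ndst,
                 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
    write_atomic(&pi_desc->nv, posted_intr_vector);

    /* Only un-suppress notifications once NDST/NV are consistent again. */
    pi_clear_sn(pi_desc);

    v->arch.hvm_vmx.pi_back_from_hotplug = 0;
}

vmx_vcpu_block() and vmx_pi_do_resume() would then both simply call this,
the latter after its _VPF_blocked ASSERT().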

> @@ -202,9 +220,14 @@ static void vmx_pi_do_resume(struct vcpu *v)
>  /* This function is called when pcidevs_lock is held */
>  void vmx_pi_hooks_assign(struct domain *d)
>  {
> +    struct vcpu *v;
> +
>      if ( !iommu_intpost || !has_hvm_container_domain(d) )
>          return;
>  
> +    for_each_vcpu ( d, v )
> +        v->arch.hvm_vmx.pi_back_from_hotplug = 1;

Thinking of possible alternatives: What about doing the necessary
adjustments right here, instead of setting the new flag?

Of course it may still be the case that the whole issue means we
indeed shouldn't zap those hooks upon device removal.

Jan
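
(A rough sketch of that alternative — doing the adjustment for each vCPU
directly in vmx_pi_hooks_assign() instead of deferring it via a flag;
whether this is safe against a vCPU concurrently blocking or migrating is
still the open question raised above:)

void vmx_pi_hooks_assign(struct domain *d)
{
    struct vcpu *v;

    if ( !iommu_intpost || !has_hvm_container_domain(d) )
        return;

    for_each_vcpu ( d, v )
    {
        struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
        unsigned int dest = cpu_physical_id(v->processor);

        write_atomic(&pi_desc->ndst,
                     x2apic_enabled ? dest
                                    : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
        write_atomic(&pi_desc->nv, posted_intr_vector);
        pi_clear_sn(pi_desc);
    }

    ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);

    d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
    /* ... remaining hook assignments unchanged ... */
}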

Patch

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index bc4410f..3fbc7b1 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -107,12 +107,22 @@  void vmx_pi_per_cpu_init(unsigned int cpu)
 static void vmx_vcpu_block(struct vcpu *v)
 {
     unsigned long flags;
-    unsigned int dest;
+    unsigned int dest = cpu_physical_id(v->processor);
     spinlock_t *old_lock;
     spinlock_t *pi_blocking_list_lock =
 		&per_cpu(vmx_pi_blocking, v->processor).lock;
     struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
 
+    if (v->arch.hvm_vmx.pi_back_from_hotplug == 1)
+    {
+        write_atomic(&pi_desc->ndst,
+                     x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
+        write_atomic(&pi_desc->nv, posted_intr_vector);
+        pi_clear_sn(pi_desc);
+
+        v->arch.hvm_vmx.pi_back_from_hotplug = 0;
+    }
+
     spin_lock_irqsave(pi_blocking_list_lock, flags);
     old_lock = cmpxchg(&v->arch.hvm_vmx.pi_blocking.lock, NULL,
                        pi_blocking_list_lock);
@@ -130,8 +140,6 @@  static void vmx_vcpu_block(struct vcpu *v)
 
     ASSERT(!pi_test_sn(pi_desc));
 
-    dest = cpu_physical_id(v->processor);
-
     ASSERT(pi_desc->ndst ==
            (x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK)));
 
@@ -164,6 +172,16 @@  static void vmx_pi_do_resume(struct vcpu *v)
     unsigned long flags;
     spinlock_t *pi_blocking_list_lock;
     struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+    unsigned int dest = cpu_physical_id(v->processor);
+
+    if (v->arch.hvm_vmx.pi_back_from_hotplug == 1)
+    {
+        write_atomic(&pi_desc->ndst,
+                     x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
+        pi_clear_sn(pi_desc);
+
+        v->arch.hvm_vmx.pi_back_from_hotplug = 0;
+    }
 
     ASSERT(!test_bit(_VPF_blocked, &v->pause_flags));
 
@@ -202,9 +220,14 @@  static void vmx_pi_do_resume(struct vcpu *v)
 /* This function is called when pcidevs_lock is held */
 void vmx_pi_hooks_assign(struct domain *d)
 {
+    struct vcpu *v;
+
     if ( !iommu_intpost || !has_hvm_container_domain(d) )
         return;
 
+    for_each_vcpu ( d, v )
+        v->arch.hvm_vmx.pi_back_from_hotplug = 1;
+
     ASSERT(!d->arch.hvm_domain.vmx.vcpu_block);
 
     d->arch.hvm_domain.vmx.vcpu_block = vmx_vcpu_block;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index b54f52f..3feb60a 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -231,6 +231,7 @@  struct arch_vmx_struct {
      * pCPU and wakeup the related vCPU.
      */
     struct pi_blocking_vcpu pi_blocking;
+    int pi_back_from_hotplug;
 };
 
 int vmx_create_vmcs(struct vcpu *v);