[v4,07/21] IOMMU/x86: support freeing of pagetables

Message ID: 1389c785-ffe3-5d8c-36f1-a923ce5250cd@suse.com (mailing list archive)
State: Superseded
Series: IOMMU: superpage support when not sharing pagetables

Commit Message

Jan Beulich April 25, 2022, 8:35 a.m. UTC
For vendor specific code to support superpages we need to be able to
deal with a superpage mapping replacing an intermediate page table (or
hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
needed to free individual page tables while a domain is still alive.
Since the freeing needs to be deferred until after a suitable IOTLB
flush was performed, released page tables get queued for processing by a
tasklet.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I was considering whether to use a softirq-tasklet instead. This would
have the benefit of avoiding extra scheduling operations, but come with
the risk of the freeing happening prematurely because of a
process_pending_softirqs() somewhere.
---
v4: Change type of iommu_queue_free_pgtable()'s 1st parameter. Re-base.
v3: Call process_pending_softirqs() from free_queued_pgtables().

Comments

Roger Pau Monné May 3, 2022, 4:20 p.m. UTC | #1
On Mon, Apr 25, 2022 at 10:35:45AM +0200, Jan Beulich wrote:
> For vendor specific code to support superpages we need to be able to
> deal with a superpage mapping replacing an intermediate page table (or
> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> needed to free individual page tables while a domain is still alive.
> Since the freeing needs to be deferred until after a suitable IOTLB
> flush was performed, released page tables get queued for processing by a
> tasklet.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I was considering whether to use a softirq-tasklet instead. This would
> have the benefit of avoiding extra scheduling operations, but come with
> the risk of the freeing happening prematurely because of a
> process_pending_softirqs() somewhere.

I'm sorry again if I already raised this; I don't seem to find a
reference.

What about doing the freeing before resuming the guest execution in
guest vCPU context?

We already have a hook like this on HVM in hvm_do_resume() calling
vpci_process_pending().  I wonder whether we could have a similar hook
for PV and keep the pages to be freed in the vCPU instead of the pCPU.
This would have the benefit of being able to context switch the vCPU
in case the operation takes too long.

Not that the current approach is wrong, but by doing it in the guest
resume path we could likely prevent guests doing heavy p2m
modifications from hogging CPU time.
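
For illustration, a rough sketch of what such a hook could look like
(the hook itself and the per-vCPU pending_pgtables list are invented
names, not existing Xen constructs):

    /* Hypothetical hook, called on the exit-to-guest path similarly to
     * how hvm_do_resume() calls vpci_process_pending(). */
    void iommu_process_pending_free(struct vcpu *v)
    {
        struct page_info *pg;

        while ( (pg = page_list_remove_head(&v->pending_pgtables)) )
        {
            free_domheap_page(pg);

            /* Too much work: let the scheduler run and continue on the
             * next resume of this vCPU. */
            if ( general_preempt_check() )
            {
                raise_softirq(SCHEDULE_SOFTIRQ);
                break;
            }
        }
    }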

> ---
> v4: Change type of iommu_queue_free_pgtable()'s 1st parameter. Re-base.
> v3: Call process_pending_softirqs() from free_queued_pgtables().
> 
> --- a/xen/arch/x86/include/asm/iommu.h
> +++ b/xen/arch/x86/include/asm/iommu.h
> @@ -147,6 +147,7 @@ void iommu_free_domid(domid_t domid, uns
>  int __must_check iommu_free_pgtables(struct domain *d);
>  struct domain_iommu;
>  struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
> +void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
>  
>  #endif /* !__ARCH_X86_IOMMU_H__ */
>  /*
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -12,6 +12,7 @@
>   * this program; If not, see <http://www.gnu.org/licenses/>.
>   */
>  
> +#include <xen/cpu.h>
>  #include <xen/sched.h>
>  #include <xen/iommu.h>
>  #include <xen/paging.h>
> @@ -550,6 +551,91 @@ struct page_info *iommu_alloc_pgtable(st
>      return pg;
>  }
>  
> +/*
> + * Intermediate page tables which get replaced by large pages may only be
> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
> + * that the necessary IOTLB flush will have occurred by the time tasklets get
> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
> + * requiring any locking.)
> + */
> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
> +
> +static void free_queued_pgtables(void *arg)
> +{
> +    struct page_list_head *list = arg;
> +    struct page_info *pg;
> +    unsigned int done = 0;
> +

With the current logic I think it might be helpful to assert that the
list is not empty when we get here?

Given the operation requires a context switch we would like to avoid
such unless there's indeed pending work to do.
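
A minimal form of such a check at the top of the function could be
(sketch, using Xen's page-list helper):

    ASSERT(!page_list_empty(list));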

> +    while ( (pg = page_list_remove_head(list)) )
> +    {
> +        free_domheap_page(pg);
> +
> +        /* Granularity of checking somewhat arbitrary. */
> +        if ( !(++done & 0x1ff) )
> +             process_pending_softirqs();
> +    }
> +}
> +
> +void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg)
> +{
> +    unsigned int cpu = smp_processor_id();
> +
> +    spin_lock(&hd->arch.pgtables.lock);
> +    page_list_del(pg, &hd->arch.pgtables.list);
> +    spin_unlock(&hd->arch.pgtables.lock);
> +
> +    page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
> +
> +    tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
> +}
> +
> +static int cf_check cpu_callback(
> +    struct notifier_block *nfb, unsigned long action, void *hcpu)
> +{
> +    unsigned int cpu = (unsigned long)hcpu;
> +    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
> +    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
> +
> +    switch ( action )
> +    {
> +    case CPU_DOWN_PREPARE:
> +        tasklet_kill(tasklet);
> +        break;
> +
> +    case CPU_DEAD:
> +        page_list_splice(list, &this_cpu(free_pgt_list));

I think you could check whether list is empty before queuing it?

Thanks, Roger.
Jan Beulich May 4, 2022, 1:07 p.m. UTC | #2
On 03.05.2022 18:20, Roger Pau Monné wrote:
> On Mon, Apr 25, 2022 at 10:35:45AM +0200, Jan Beulich wrote:
>> For vendor specific code to support superpages we need to be able to
>> deal with a superpage mapping replacing an intermediate page table (or
>> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
>> needed to free individual page tables while a domain is still alive.
>> Since the freeing needs to be deferred until after a suitable IOTLB
>> flush was performed, released page tables get queued for processing by a
>> tasklet.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> I was considering whether to use a softirq-tasklet instead. This would
>> have the benefit of avoiding extra scheduling operations, but come with
>> the risk of the freeing happening prematurely because of a
>> process_pending_softirqs() somewhere.
> 
> I'm sorry again if I already raised this; I don't seem to find a
> reference.

Earlier on you only suggested "to perform the freeing after the flush".

> What about doing the freeing before resuming the guest execution in
> guest vCPU context?
> 
> We already have a hook like this on HVM in hvm_do_resume() calling
> vpci_process_pending().  I wonder whether we could have a similar hook
> for PV and keep the pages to be freed in the vCPU instead of the pCPU.
> This would have the benefit of being able to context switch the vCPU
> in case the operation takes too long.

I think this might work in general, but would be troublesome when
preparing Dom0 (where we don't run on any of Dom0's vCPU-s, and we
won't ever "exit to guest context" on an idle vCPU). I'm also not
really keen on using something like

    v = current->domain == d ? current : d->vcpu[0];

(leaving aside that we don't really have d available in
iommu_queue_free_pgtable() and I'd be hesitant to convert it back).
Otoh it might be okay to free page tables right away for domains
which haven't run at all so far. But this would again require
passing struct domain * to iommu_queue_free_pgtable().

Another upside (I think) of the current approach is that all logic
is contained in a single source file (i.e. in particular there's no
new field needed in a per-vCPU structure defined in some header).

> Not that the current approach is wrong, but by doing it in the guest
> resume path we could likely prevent guests doing heavy p2m
> modifications from hogging CPU time.

Well, they would still be hogging time, but that time would then be
accounted towards their time slices, yes.

>> @@ -550,6 +551,91 @@ struct page_info *iommu_alloc_pgtable(st
>>      return pg;
>>  }
>>  
>> +/*
>> + * Intermediate page tables which get replaced by large pages may only be
>> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
>> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
>> + * that the necessary IOTLB flush will have occurred by the time tasklets get
>> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
>> + * requiring any locking.)
>> + */
>> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
>> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
>> +
>> +static void free_queued_pgtables(void *arg)
>> +{
>> +    struct page_list_head *list = arg;
>> +    struct page_info *pg;
>> +    unsigned int done = 0;
>> +
> 
> With the current logic I think it might be helpful to assert that the
> list is not empty when we get here?
> 
> Given the operation requires a context switch we would like to avoid
> such unless there's indeed pending work to do.

But is that worth adding an assertion and risking killing a system just
because there's a race somewhere by which we might get here without any
work to do? If you strongly think we want to know about such instances,
how about a WARN_ON_ONCE() (except that we still don't have that
specific construct; it would need to be open-coded for the time being)?
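
For illustration, such an open-coded variant could be as simple as
(sketch, at the top of free_queued_pgtables()):

    static bool __read_mostly warned;

    /* Warn (once) if the tasklet ran without any pending work. */
    if ( unlikely(page_list_empty(list)) && !test_and_set_bool(warned) )
        WARN();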

>> +static int cf_check cpu_callback(
>> +    struct notifier_block *nfb, unsigned long action, void *hcpu)
>> +{
>> +    unsigned int cpu = (unsigned long)hcpu;
>> +    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
>> +    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
>> +
>> +    switch ( action )
>> +    {
>> +    case CPU_DOWN_PREPARE:
>> +        tasklet_kill(tasklet);
>> +        break;
>> +
>> +    case CPU_DEAD:
>> +        page_list_splice(list, &this_cpu(free_pgt_list));
> 
> I think you could check whether list is empty before queuing it?

I could, but this would make the code (slightly) more complicated
for improving something which doesn't occur frequently.

Jan
Roger Pau Monné May 4, 2022, 3:06 p.m. UTC | #3
On Wed, May 04, 2022 at 03:07:24PM +0200, Jan Beulich wrote:
> On 03.05.2022 18:20, Roger Pau Monné wrote:
> > On Mon, Apr 25, 2022 at 10:35:45AM +0200, Jan Beulich wrote:
> >> For vendor specific code to support superpages we need to be able to
> >> deal with a superpage mapping replacing an intermediate page table (or
> >> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> >> needed to free individual page tables while a domain is still alive.
> >> Since the freeing needs to be deferred until after a suitable IOTLB
> >> flush was performed, released page tables get queued for processing by a
> >> tasklet.
> >>
> >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> ---
> >> I was considering whether to use a softirq-tasklet instead. This would
> >> have the benefit of avoiding extra scheduling operations, but come with
> >> the risk of the freeing happening prematurely because of a
> >> process_pending_softirqs() somewhere.
> > 
> > I'm sorry again if I already raised this; I don't seem to find a
> > reference.
> 
> Earlier on you only suggested "to perform the freeing after the flush".
> 
> > What about doing the freeing before resuming the guest execution in
> > guest vCPU context?
> > 
> > We already have a hook like this on HVM in hvm_do_resume() calling
> > vpci_process_pending().  I wonder whether we could have a similar hook
> > for PV and keep the pages to be freed in the vCPU instead of the pCPU.
> > This would have the benefit of being able to context switch the vCPU
> > in case the operation takes too long.
> 
> I think this might work in general, but would be troublesome when
> preparing Dom0 (where we don't run on any of Dom0's vCPU-s, and we
> won't ever "exit to guest context" on an idle vCPU). I'm also not
> really keen on using something like
> 
>     v = current->domain == d ? current : d->vcpu[0];

I guess a problematic case would also be hypercalls executed in a
domain context triggering the freeing of a different domain's IOMMU page
table pages, as then the freeing would be accounted to the current
domain instead of the owner of the pages.

dom0 doesn't seem that problematic: any freeing triggered in a system
domain context could be performed in place (with
process_pending_softirqs() calls to ensure no watchdog triggering).

> (leaving aside that we don't really have d available in
> iommu_queue_free_pgtable() and I'd be hesitant to convert it back).
> Otoh it might be okay to free page tables right away for domains
> which haven't run at all so far.

Could be, but then we would have to make hypercalls that can trigger
those paths preemptible I would think.

> But this would again require
> passing struct domain * to iommu_queue_free_pgtable().

Hm, I guess we could use container_of with the domain_iommu parameter
to obtain a pointer to the domain struct.
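
I.e. something like this sketch, assuming the domain_iommu instance
remains embedded in struct domain:

    struct domain *d = container_of(hd, struct domain, iommu);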

> Another upside (I think) of the current approach is that all logic
> is contained in a single source file (i.e. in particular there's no
> new field needed in a per-vCPU structure defined in some header).

Right, I do agree with that.  I'm mostly worried about the resource
starvation aspect.  I guess freeing the pages replaced by a 1G superpage
entry is still fine; bigger could be a problem.
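
(For scale, assuming 4k pages with 512 entries per level: collapsing a
fully-populated 1G range frees one page directory plus its 512 page
tables, i.e. up to 513 pages, while a 512G collapse would be roughly
512 times that.)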

> > Not that the current approach is wrong, but by doing it in the guest
> > resume path we could likely prevent guests doing heavy p2m
> > modifications from hogging CPU time.
> 
> Well, they would still be hogging time, but that time would then be
> accounted towards their time slices, yes.
> 
> >> @@ -550,6 +551,91 @@ struct page_info *iommu_alloc_pgtable(st
> >>      return pg;
> >>  }
> >>  
> >> +/*
> >> + * Intermediate page tables which get replaced by large pages may only be
> >> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
> >> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
> >> + * that the necessary IOTLB flush will have occurred by the time tasklets get
> >> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
> >> + * requiring any locking.)
> >> + */
> >> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
> >> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
> >> +
> >> +static void free_queued_pgtables(void *arg)
> >> +{
> >> +    struct page_list_head *list = arg;
> >> +    struct page_info *pg;
> >> +    unsigned int done = 0;
> >> +
> > 
> > With the current logic I think it might be helpful to assert that the
> > list is not empty when we get here?
> > 
> > Given the operation requires a context switch we would like to avoid
> > such unless there's indeed pending work to do.
> 
> But is that worth adding an assertion and risking killing a system just
> because there's a race somewhere by which we might get here without any
> work to do? If you strongly think we want to know about such instances,
> how about a WARN_ON_ONCE() (except that we still don't have that
> specific construct; it would need to be open-coded for the time being)?

Well, I was recommending an assert because I think it's fine to kill a
debug system in order to catch those outliers. On production builds we
should obviously not crash.

> >> +static int cf_check cpu_callback(
> >> +    struct notifier_block *nfb, unsigned long action, void *hcpu)
> >> +{
> >> +    unsigned int cpu = (unsigned long)hcpu;
> >> +    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
> >> +    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
> >> +
> >> +    switch ( action )
> >> +    {
> >> +    case CPU_DOWN_PREPARE:
> >> +        tasklet_kill(tasklet);
> >> +        break;
> >> +
> >> +    case CPU_DEAD:
> >> +        page_list_splice(list, &this_cpu(free_pgt_list));
> > 
> > I think you could check whether list is empty before queuing it?
> 
> I could, but this would make the code (slightly) more complicated
> for improving something which doesn't occur frequently.

It's just a:

if ( page_list_empty(list) )
    break;

at the start of the CPU_DEAD case AFAICT.  As you say this notifier is
not to be called frequently, so not a big deal (also I don't think the
addition makes the code more complicated).

Now that I look at the code again, I think there's a
tasklet_schedule() missing in the CPU_DOWN_FAILED case if there are
entries pending on the list?

Thanks, Roger.
Jan Beulich May 5, 2022, 8:20 a.m. UTC | #4
On 04.05.2022 17:06, Roger Pau Monné wrote:
> On Wed, May 04, 2022 at 03:07:24PM +0200, Jan Beulich wrote:
>> On 03.05.2022 18:20, Roger Pau Monné wrote:
>>> On Mon, Apr 25, 2022 at 10:35:45AM +0200, Jan Beulich wrote:
>>>> For vendor specific code to support superpages we need to be able to
>>>> deal with a superpage mapping replacing an intermediate page table (or
>>>> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
>>>> needed to free individual page tables while a domain is still alive.
>>>> Since the freeing needs to be deferred until after a suitable IOTLB
>>>> flush was performed, released page tables get queued for processing by a
>>>> tasklet.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> I was considering whether to use a softirq-tasklet instead. This would
>>>> have the benefit of avoiding extra scheduling operations, but come with
>>>> the risk of the freeing happening prematurely because of a
>>>> process_pending_softirqs() somewhere.
>>>
>>> I'm sorry again if I already raised this; I don't seem to find a
>>> reference.
>>
>> Earlier on you only suggested "to perform the freeing after the flush".
>>
>>> What about doing the freeing before resuming the guest execution in
>>> guest vCPU context?
>>>
>>> We already have a hook like this on HVM in hvm_do_resume() calling
>>> vpci_process_pending().  I wonder whether we could have a similar hook
>>> for PV and keep the pages to be freed in the vCPU instead of the pCPU.
>>> This would have the benefit of being able to context switch the vCPU
>>> in case the operation takes too long.
>>
>> I think this might work in general, but would be troublesome when
>> preparing Dom0 (where we don't run on any of Dom0's vCPU-s, and we
>> won't ever "exit to guest context" on an idle vCPU). I'm also not
>> really keen on using something like
>>
>>     v = current->domain == d ? current : d->vcpu[0];
> 
> I guess a problematic case would also be hypercalls executed in a
> domain context triggering the freeing of a different domain's IOMMU page
> table pages, as then the freeing would be accounted to the current
> domain instead of the owner of the pages.

Aiui such can happen only during domain construction. Any such
operation behind the back of a running guest is imo problematic.

> dom0 doesn't seem that problematic: any freeing triggered in a system
> domain context could be performed in place (with
> process_pending_softirqs() calls to ensure no watchdog triggering).
> 
>> (leaving aside that we don't really have d available in
>> iommu_queue_free_pgtable() and I'd be hesitant to convert it back).
>> Otoh it might be okay to free page tables right away for domains
>> which haven't run at all so far.
> 
> Could be, but then we would have to make hypercalls that can trigger
> those paths preemptible I would think.

Yes, if they aren't already and if they allow for freeing of
sufficiently large numbers of pages. That's kind of another argument
against doing so right here, isn't it?

>> But this would again require
>> passing struct domain * to iommu_queue_free_pgtable().
> 
> Hm, I guess we could use container_of with the domain_iommu parameter
> to obtain a pointer to the domain struct.

I was afraid you might suggest this. It would be sort of okay since
the reference to struct domain isn't really altering that struct,
but the goal of limiting what is passed to the function was to
prove that the full struct domain isn't required there. Also doing
so would tie us to the iommu piece actually being a sub-structure of
struct domain, whereas I expect it to become a pointer to a separate
structure sooner or later.

>>>> @@ -550,6 +551,91 @@ struct page_info *iommu_alloc_pgtable(st
>>>>      return pg;
>>>>  }
>>>>  
>>>> +/*
>>>> + * Intermediate page tables which get replaced by large pages may only be
>>>> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
>>>> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
>>>> + * that the necessary IOTLB flush will have occurred by the time tasklets get
>>>> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
>>>> + * requiring any locking.)
>>>> + */
>>>> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
>>>> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
>>>> +
>>>> +static void free_queued_pgtables(void *arg)
>>>> +{
>>>> +    struct page_list_head *list = arg;
>>>> +    struct page_info *pg;
>>>> +    unsigned int done = 0;
>>>> +
>>>
>>> With the current logic I think it might be helpful to assert that the
>>> list is not empty when we get here?
>>>
>>> Given the operation requires a context switch we would like to avoid
>>> such unless there's indeed pending work to do.
>>
>> But is that worth adding an assertion and risking killing a system just
>> because there's a race somewhere by which we might get here without any
>> work to do? If you strongly think we want to know about such instances,
>> how about a WARN_ON_ONCE() (except that we still don't have that
>> specific construct; it would need to be open-coded for the time being)?
> 
> Well, I was recommending an assert because I think it's fine to kill a
> debug system in order to catch those outliers. On production builds we
> should obviously not crash.

I disagree - such a crash may be rather disturbing to someone doing work
on Xen without being familiar with the IOMMU details.

>>>> +static int cf_check cpu_callback(
>>>> +    struct notifier_block *nfb, unsigned long action, void *hcpu)
>>>> +{
>>>> +    unsigned int cpu = (unsigned long)hcpu;
>>>> +    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
>>>> +    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
>>>> +
>>>> +    switch ( action )
>>>> +    {
>>>> +    case CPU_DOWN_PREPARE:
>>>> +        tasklet_kill(tasklet);
>>>> +        break;
>>>> +
>>>> +    case CPU_DEAD:
>>>> +        page_list_splice(list, &this_cpu(free_pgt_list));
>>>
>>> I think you could check whether list is empty before queuing it?
>>
>> I could, but this would make the code (slightly) more complicated
>> for improving something which doesn't occur frequently.
> 
> It's just a:
> 
> if ( page_list_empty(list) )
>     break;
> 
> at the start of the CPU_DEAD case AFAICT.  As you say this notifier is
> not to be called frequently, so not a big deal (also I don't think the
> addition makes the code more complicated).

Okay, I've made that conditional, not least because I think ...

> Now that I look at the code again, I think there's a
> tasklet_schedule() missing in the CPU_DOWN_FAILED case if there are
> entries pending on the list?

... this, which indeed was missing, wants to be conditional. While
adding this I did notice that INIT_PAGE_LIST_HEAD() was also missing
for CPU_UP_PREPARE - that's benign for most configs, but necessary
in BIGMEM ones.
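
Putting these adjustments together, the CPU_DEAD and
CPU_UP_PREPARE/CPU_DOWN_FAILED handling would look roughly like this
(sketch):

    case CPU_DEAD:
        if ( !page_list_empty(list) )
        {
            page_list_splice(list, &this_cpu(free_pgt_list));
            INIT_PAGE_LIST_HEAD(list);
            tasklet_schedule(&this_cpu(free_pgt_tasklet));
        }
        break;

    case CPU_UP_PREPARE:
        INIT_PAGE_LIST_HEAD(list);
        /* fall through */
    case CPU_DOWN_FAILED:
        tasklet_init(tasklet, free_queued_pgtables, list);
        if ( !page_list_empty(list) )
            tasklet_schedule(tasklet);
        break;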

Jan
Roger Pau Monné May 5, 2022, 9:57 a.m. UTC | #5
On Thu, May 05, 2022 at 10:20:36AM +0200, Jan Beulich wrote:
> On 04.05.2022 17:06, Roger Pau Monné wrote:
> > On Wed, May 04, 2022 at 03:07:24PM +0200, Jan Beulich wrote:
> >> On 03.05.2022 18:20, Roger Pau Monné wrote:
> >>> On Mon, Apr 25, 2022 at 10:35:45AM +0200, Jan Beulich wrote:
> >>>> For vendor specific code to support superpages we need to be able to
> >>>> deal with a superpage mapping replacing an intermediate page table (or
> >>>> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> >>>> needed to free individual page tables while a domain is still alive.
> >>>> Since the freeing needs to be deferred until after a suitable IOTLB
> >>>> flush was performed, released page tables get queued for processing by a
> >>>> tasklet.
> >>>>
> >>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >>>> ---
> >>>> I was considering whether to use a softirq-tasklet instead. This would
> >>>> have the benefit of avoiding extra scheduling operations, but come with
> >>>> the risk of the freeing happening prematurely because of a
> >>>> process_pending_softirqs() somewhere.
> >>>
> >>> I'm sorry again if I already raised this; I don't seem to find a
> >>> reference.
> >>
> >> Earlier on you only suggested "to perform the freeing after the flush".
> >>
> >>> What about doing the freeing before resuming the guest execution in
> >>> guest vCPU context?
> >>>
> >>> We already have a hook like this on HVM in hvm_do_resume() calling
> >>> vpci_process_pending().  I wonder whether we could have a similar hook
> >>> for PV and keep the pages to be freed in the vCPU instead of the pCPU.
> >>> This would have the benefit of being able to context switch the vCPU
> >>> in case the operation takes too long.
> >>
> >> I think this might work in general, but would be troublesome when
> >> preparing Dom0 (where we don't run on any of Dom0's vCPU-s, and we
> >> won't ever "exit to guest context" on an idle vCPU). I'm also not
> >> really keen on using something like
> >>
> >>     v = current->domain == d ? current : d->vcpu[0];
> > 
> > I guess a problematic case would also be hypercalls executed in a
> > domain context triggering the freeing of a different domain's IOMMU page
> > table pages, as then the freeing would be accounted to the current
> > domain instead of the owner of the pages.
> 
> Aiui such can happen only during domain construction. Any such
> operation behind the back of a running guest is imo problematic.
> 
> > dom0 doesn't seem that problematic: any freeing triggered in a system
> > domain context could be performed in place (with
> > process_pending_softirqs() calls to ensure no watchdog triggering).
> > 
> >> (leaving aside that we don't really have d available in
> >> iommu_queue_free_pgtable() and I'd be hesitant to convert it back).
> >> Otoh it might be okay to free page tables right away for domains
> >> which haven't run at all so far.
> > 
> > Could be, but then we would have to make hypercalls that can trigger
> > those paths preemptible I would think.
> 
> Yes, if they aren't already and if they allow for freeing of
> sufficiently large numbers of pages. That's kind of another argument
> against doing so right here, isn't it?

Indeed, as it's likely to make the implementation more complex IMO.

So let's use this pCPU implementation.

Thanks, Roger.
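
For context, a sketch of how a vendor-side caller might use the new
helper when collapsing an intermediate table into a superpage entry
(pde, superpage_pte and old_table_maddr are invented names; the real
VT-d/AMD callers appear later in the series):

    /* Install the superpage entry in place of the table reference... */
    struct page_info *old_pg = maddr_to_page(old_table_maddr);

    write_atomic(&pde->val, superpage_pte.val);

    /*
     * ...and queue the old table.  The IOTLB flush the caller issues
     * afterwards completes before the freeing tasklet runs, which is
     * what makes the deferred free safe.
     */
    iommu_queue_free_pgtable(hd, old_pg);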

Patch

--- a/xen/arch/x86/include/asm/iommu.h
+++ b/xen/arch/x86/include/asm/iommu.h
@@ -147,6 +147,7 @@  void iommu_free_domid(domid_t domid, uns
 int __must_check iommu_free_pgtables(struct domain *d);
 struct domain_iommu;
 struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
+void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
 
 #endif /* !__ARCH_X86_IOMMU_H__ */
 /*
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -12,6 +12,7 @@ 
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/cpu.h>
 #include <xen/sched.h>
 #include <xen/iommu.h>
 #include <xen/paging.h>
@@ -550,6 +551,91 @@  struct page_info *iommu_alloc_pgtable(st
     return pg;
 }
 
+/*
+ * Intermediate page tables which get replaced by large pages may only be
+ * freed after a suitable IOTLB flush. Hence such pages get queued on a
+ * per-CPU list, with a per-CPU tasklet processing the list on the assumption
+ * that the necessary IOTLB flush will have occurred by the time tasklets get
+ * to run. (List and tasklet being per-CPU has the benefit of accesses not
+ * requiring any locking.)
+ */
+static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
+static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
+
+static void free_queued_pgtables(void *arg)
+{
+    struct page_list_head *list = arg;
+    struct page_info *pg;
+    unsigned int done = 0;
+
+    while ( (pg = page_list_remove_head(list)) )
+    {
+        free_domheap_page(pg);
+
+        /* Granularity of checking somewhat arbitrary. */
+        if ( !(++done & 0x1ff) )
+             process_pending_softirqs();
+    }
+}
+
+void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg)
+{
+    unsigned int cpu = smp_processor_id();
+
+    spin_lock(&hd->arch.pgtables.lock);
+    page_list_del(pg, &hd->arch.pgtables.list);
+    spin_unlock(&hd->arch.pgtables.lock);
+
+    page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
+
+    tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
+}
+
+static int cf_check cpu_callback(
+    struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+    unsigned int cpu = (unsigned long)hcpu;
+    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
+    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
+
+    switch ( action )
+    {
+    case CPU_DOWN_PREPARE:
+        tasklet_kill(tasklet);
+        break;
+
+    case CPU_DEAD:
+        page_list_splice(list, &this_cpu(free_pgt_list));
+        INIT_PAGE_LIST_HEAD(list);
+        tasklet_schedule(&this_cpu(free_pgt_tasklet));
+        break;
+
+    case CPU_UP_PREPARE:
+    case CPU_DOWN_FAILED:
+        tasklet_init(tasklet, free_queued_pgtables, list);
+        break;
+    }
+
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block cpu_nfb = {
+    .notifier_call = cpu_callback,
+};
+
+static int __init cf_check bsp_init(void)
+{
+    if ( iommu_enabled )
+    {
+        cpu_callback(&cpu_nfb, CPU_UP_PREPARE,
+                     (void *)(unsigned long)smp_processor_id());
+        register_cpu_notifier(&cpu_nfb);
+    }
+
+    return 0;
+}
+presmp_initcall(bsp_init);
+
 bool arch_iommu_use_permitted(const struct domain *d)
 {
     /*