
[v5,03/15] IOMMU/x86: support freeing of pagetables

Message ID 614413d8-5043-f0e3-929b-f161fa89bb35@suse.com (mailing list archive)
State Superseded
Series IOMMU: superpage support when not sharing pagetables

Commit Message

Jan Beulich May 27, 2022, 11:13 a.m. UTC
For vendor specific code to support superpages we need to be able to
deal with a superpage mapping replacing an intermediate page table (or
hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
needed to free individual page tables while a domain is still alive.
Since the freeing needs to be deferred until after a suitable IOTLB
flush was performed, released page tables get queued for processing by a
tasklet.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I was considering whether to use a softirq-tasklet instead. This would
have the benefit of avoiding extra scheduling operations, but come with
the risk of the freeing happening prematurely because of a
process_pending_softirqs() somewhere.
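
For reference, the two options differ only in which init call sets up the
per-CPU tasklet. This is an illustrative fragment against the existing
tasklet API (the free_pgt_* names are the ones used in the patch; this is
not part of the change itself):

    /* As posted: an ordinary tasklet, run from the scheduler/idle path. */
    tasklet_init(&this_cpu(free_pgt_tasklet), free_queued_pgtables,
                 &this_cpu(free_pgt_list));

    /*
     * The softirq-tasklet alternative would run from softirq context, and
     * hence also from any process_pending_softirqs() call - which is
     * exactly the premature-freeing risk noted above.
     */
    softirq_tasklet_init(&this_cpu(free_pgt_tasklet), free_queued_pgtables,
                         &this_cpu(free_pgt_list));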
---
v5: Fix CPU_UP_PREPARE for BIGMEM. Schedule tasklet in CPU_DOWN_FAILED
    when list is not empty. Skip all processing in CPU_DEAD when list is
    empty.
v4: Change type of iommu_queue_free_pgtable()'s 1st parameter. Re-base.
v3: Call process_pending_softirqs() from free_queued_pgtables().
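
As an illustration of the intended use (not part of this patch): a vendor
driver installing a superpage over an existing intermediate table cannot
free that table synchronously, since the IOTLB may still reference it.
A minimal sketch, with the PTE handling simplified and
replace_with_superpage() plus its parameters being made-up names:

    static void replace_with_superpage(struct domain_iommu *hd,
                                       uint64_t *pte, uint64_t superpage_pte,
                                       struct page_info *old_pt)
    {
        /* Point the parent entry at the superpage instead of the table. */
        write_atomic(pte, superpage_pte);

        /*
         * The old table may still be cached in the IOTLB, so it must not be
         * freed (and possibly reused) right away.  Queue it instead; the
         * per-CPU tasklet frees it only once tasklets get to run, i.e. after
         * the necessary IOTLB flush.
         */
        iommu_queue_free_pgtable(hd, old_pt);
    }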

Comments

Roger Pau Monné May 31, 2022, 4:25 p.m. UTC | #1
On Fri, May 27, 2022 at 01:13:09PM +0200, Jan Beulich wrote:
> For vendor specific code to support superpages we need to be able to
> deal with a superpage mapping replacing an intermediate page table (or
> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> needed to free individual page tables while a domain is still alive.
> Since the freeing needs to be deferred until after a suitable IOTLB
> flush was performed, released page tables get queued for processing by a
> tasklet.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I was considering whether to use a softirq-tasklet instead. This would
> have the benefit of avoiding extra scheduling operations, but come with
> the risk of the freeing happening prematurely because of a
> process_pending_softirqs() somewhere.
> ---
> v5: Fix CPU_UP_PREPARE for BIGMEM. Schedule tasklet in CPU_DOWN_FAILED
>     when list is not empty. Skip all processing in CPU_DEAD when list is
>     empty.
> v4: Change type of iommu_queue_free_pgtable()'s 1st parameter. Re-base.
> v3: Call process_pending_softirqs() from free_queued_pgtables().
> 
> --- a/xen/arch/x86/include/asm/iommu.h
> +++ b/xen/arch/x86/include/asm/iommu.h
> @@ -147,6 +147,7 @@ void iommu_free_domid(domid_t domid, uns
>  int __must_check iommu_free_pgtables(struct domain *d);
>  struct domain_iommu;
>  struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
> +void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
>  
>  #endif /* !__ARCH_X86_IOMMU_H__ */
>  /*
> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -12,6 +12,7 @@
>   * this program; If not, see <http://www.gnu.org/licenses/>.
>   */
>  
> +#include <xen/cpu.h>
>  #include <xen/sched.h>
>  #include <xen/iocap.h>
>  #include <xen/iommu.h>
> @@ -566,6 +567,98 @@ struct page_info *iommu_alloc_pgtable(st
>      return pg;
>  }
>  
> +/*
> + * Intermediate page tables which get replaced by large pages may only be
> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
> + * that the necessary IOTLB flush will have occurred by the time tasklets get
> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
> + * requiring any locking.)
> + */
> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
> +
> +static void free_queued_pgtables(void *arg)
> +{
> +    struct page_list_head *list = arg;
> +    struct page_info *pg;
> +    unsigned int done = 0;
> +
> +    while ( (pg = page_list_remove_head(list)) )
> +    {
> +        free_domheap_page(pg);
> +
> +        /* Granularity of checking somewhat arbitrary. */
> +        if ( !(++done & 0x1ff) )
> +             process_pending_softirqs();

Hm, I'm wondering whether we really want to process pending softirqs
here.

Such processing will prevent the watchdog from triggering, which we
likely want in production builds.  OTOH in debug builds we should make
sure that free_queued_pgtables() doesn't take longer than a watchdog
window, or else it's likely to cause issues to guests scheduled on
this same pCPU (and calling process_pending_softirqs() will just mask
it).

Thanks, Roger.
Jan Beulich June 1, 2022, 7:32 a.m. UTC | #2
On 31.05.2022 18:25, Roger Pau Monné wrote:
> On Fri, May 27, 2022 at 01:13:09PM +0200, Jan Beulich wrote:
>> @@ -566,6 +567,98 @@ struct page_info *iommu_alloc_pgtable(st
>>      return pg;
>>  }
>>  
>> +/*
>> + * Intermediate page tables which get replaced by large pages may only be
>> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
>> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
>> + * that the necessary IOTLB flush will have occurred by the time tasklets get
>> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
>> + * requiring any locking.)
>> + */
>> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
>> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
>> +
>> +static void free_queued_pgtables(void *arg)
>> +{
>> +    struct page_list_head *list = arg;
>> +    struct page_info *pg;
>> +    unsigned int done = 0;
>> +
>> +    while ( (pg = page_list_remove_head(list)) )
>> +    {
>> +        free_domheap_page(pg);
>> +
>> +        /* Granularity of checking somewhat arbitrary. */
>> +        if ( !(++done & 0x1ff) )
>> +             process_pending_softirqs();
> 
> Hm, I'm wondering whether we really want to process pending softirqs
> here.
> 
> Such processing will prevent the watchdog from triggering, which we
> likely want in production builds.  OTOH in debug builds we should make
> sure that free_queued_pgtables() doesn't take longer than a watchdog
> window, or else it's likely to cause issues to guests scheduled on
> this same pCPU (and calling process_pending_softirqs() will just mask
> it).

Doesn't this consideration apply to about every use of the function we
already have in the code base?

Jan
Roger Pau Monné June 1, 2022, 9:24 a.m. UTC | #3
On Wed, Jun 01, 2022 at 09:32:44AM +0200, Jan Beulich wrote:
> On 31.05.2022 18:25, Roger Pau Monné wrote:
> > On Fri, May 27, 2022 at 01:13:09PM +0200, Jan Beulich wrote:
> >> @@ -566,6 +567,98 @@ struct page_info *iommu_alloc_pgtable(st
> >>      return pg;
> >>  }
> >>  
> >> +/*
> >> + * Intermediate page tables which get replaced by large pages may only be
> >> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
> >> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
> >> + * that the necessary IOTLB flush will have occurred by the time tasklets get
> >> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
> >> + * requiring any locking.)
> >> + */
> >> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
> >> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
> >> +
> >> +static void free_queued_pgtables(void *arg)
> >> +{
> >> +    struct page_list_head *list = arg;
> >> +    struct page_info *pg;
> >> +    unsigned int done = 0;
> >> +
> >> +    while ( (pg = page_list_remove_head(list)) )
> >> +    {
> >> +        free_domheap_page(pg);
> >> +
> >> +        /* Granularity of checking somewhat arbitrary. */
> >> +        if ( !(++done & 0x1ff) )
> >> +             process_pending_softirqs();
> > 
> > Hm, I'm wondering whether we really want to process pending softirqs
> > here.
> > 
> > Such processing will prevent the watchdog from triggering, which we
> > likely want in production builds.  OTOH in debug builds we should make
> > sure that free_queued_pgtables() doesn't take longer than a watchdog
> > window, or else it's likely to cause issues to guests scheduled on
> > this same pCPU (and calling process_pending_softirqs() will just mask
> > it).
> 
> Doesn't this consideration apply to about every use of the function we
> already have in the code base?

Not really, at least when used by init code or by the debug key
handlers.  This use is IMO different from what I would expect, as it's
a guest-triggered path that we believe does require such processing.
Normally we would use continuations for such long-running,
guest-triggered operations.

Thanks, Roger.
Jan Beulich June 1, 2022, 3:25 p.m. UTC | #4
On 01.06.2022 11:24, Roger Pau Monné wrote:
> On Wed, Jun 01, 2022 at 09:32:44AM +0200, Jan Beulich wrote:
>> On 31.05.2022 18:25, Roger Pau Monné wrote:
>>> On Fri, May 27, 2022 at 01:13:09PM +0200, Jan Beulich wrote:
>>>> @@ -566,6 +567,98 @@ struct page_info *iommu_alloc_pgtable(st
>>>>      return pg;
>>>>  }
>>>>  
>>>> +/*
>>>> + * Intermediate page tables which get replaced by large pages may only be
>>>> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
>>>> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
>>>> + * that the necessary IOTLB flush will have occurred by the time tasklets get
>>>> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
>>>> + * requiring any locking.)
>>>> + */
>>>> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
>>>> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
>>>> +
>>>> +static void free_queued_pgtables(void *arg)
>>>> +{
>>>> +    struct page_list_head *list = arg;
>>>> +    struct page_info *pg;
>>>> +    unsigned int done = 0;
>>>> +
>>>> +    while ( (pg = page_list_remove_head(list)) )
>>>> +    {
>>>> +        free_domheap_page(pg);
>>>> +
>>>> +        /* Granularity of checking somewhat arbitrary. */
>>>> +        if ( !(++done & 0x1ff) )
>>>> +             process_pending_softirqs();
>>>
>>> Hm, I'm wondering whether we really want to process pending softirqs
>>> here.
>>>
>>> Such processing will prevent the watchdog from triggering, which we
>>> likely want in production builds.  OTOH in debug builds we should make
>>> sure that free_queued_pgtables() doesn't take longer than a watchdog
>>> window, or else it's likely to cause issues to guests scheduled on
>>> this same pCPU (and calling process_pending_softirqs() will just mask
>>> it).
>>
>> Doesn't this consideration apply to about every use of the function we
>> already have in the code base?
> 
> Not really, at least when used by init code or by the debug key
> handlers.  This use is IMO different from what I would expect, as it's
> a guest-triggered path that we believe does require such processing.
> Normally we would use continuations for such long-running,
> guest-triggered operations.

So what do you suggest I do? Putting the call inside #ifndef CONFIG_DEBUG
is not a good option imo. Re-scheduling the tasklet wouldn't help, aiui
(it would still run again right away). Moving the work to another CPU so
this one can do other things isn't very good either - what if other CPUs
are similarly busy? What remains is making things more complicated here by
involving a timer, the handler of which would re-schedule the tasklet. I
have to admit I don't like that very much either. All the more so since the
use of process_pending_softirqs() is "just in case" here anyway - if lots
of page tables were to be queued, I'd expect the queuing entity to be
preempted before a rather large pile could accumulate.

Maybe I could make iommu_queue_free_pgtable() return non-void, to instruct
the caller to bubble up a preemption notification once a certain number
of pages have been queued for freeing. This might end up intrusive ...

Jan
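
A possible shape of that non-void variant (purely illustrative, not what was
posted; the per-CPU counter and the 512-page threshold are assumptions):

    static DEFINE_PER_CPU(unsigned int, queued_pgt_count);

    bool iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg)
    {
        unsigned int cpu = smp_processor_id();

        spin_lock(&hd->arch.pgtables.lock);
        page_list_del(pg, &hd->arch.pgtables.list);
        spin_unlock(&hd->arch.pgtables.lock);

        page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
        tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));

        /*
         * Suggest to the caller that it bubble up a preemption (e.g. via a
         * continuation) every 512 queued pages.  The counter would be reset
         * by free_queued_pgtables() once the list is drained.
         */
        return !(++per_cpu(queued_pgt_count, cpu) & 0x1ff);
    }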
Roger Pau Monné June 2, 2022, 8:57 a.m. UTC | #5
On Wed, Jun 01, 2022 at 05:25:16PM +0200, Jan Beulich wrote:
> On 01.06.2022 11:24, Roger Pau Monné wrote:
> > On Wed, Jun 01, 2022 at 09:32:44AM +0200, Jan Beulich wrote:
> >> On 31.05.2022 18:25, Roger Pau Monné wrote:
> >>> On Fri, May 27, 2022 at 01:13:09PM +0200, Jan Beulich wrote:
> >>>> @@ -566,6 +567,98 @@ struct page_info *iommu_alloc_pgtable(st
> >>>>      return pg;
> >>>>  }
> >>>>  
> >>>> +/*
> >>>> + * Intermediate page tables which get replaced by large pages may only be
> >>>> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
> >>>> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
> >>>> + * that the necessary IOTLB flush will have occurred by the time tasklets get
> >>>> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
> >>>> + * requiring any locking.)
> >>>> + */
> >>>> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
> >>>> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
> >>>> +
> >>>> +static void free_queued_pgtables(void *arg)
> >>>> +{
> >>>> +    struct page_list_head *list = arg;
> >>>> +    struct page_info *pg;
> >>>> +    unsigned int done = 0;
> >>>> +
> >>>> +    while ( (pg = page_list_remove_head(list)) )
> >>>> +    {
> >>>> +        free_domheap_page(pg);
> >>>> +
> >>>> +        /* Granularity of checking somewhat arbitrary. */
> >>>> +        if ( !(++done & 0x1ff) )
> >>>> +             process_pending_softirqs();
> >>>
> >>> Hm, I'm wondering whether we really want to process pending softirqs
> >>> here.
> >>>
> >>> Such processing will prevent the watchdog from triggering, which we
> >>> likely want in production builds.  OTOH in debug builds we should make
> >>> sure that free_queued_pgtables() doesn't take longer than a watchdog
> >>> window, or else it's likely to cause issues to guests scheduled on
> >>> this same pCPU (and calling process_pending_softirqs() will just mask
> >>> it).
> >>
> >> Doesn't this consideration apply to about every use of the function we
> >> already have in the code base?
> > 
> > Not really, at least when used by init code or by the debug key
> > handlers.  This use is IMO different from what I would expect, as it's
> > a guest-triggered path that we believe does require such processing.
> > Normally we would use continuations for such long-running,
> > guest-triggered operations.
> 
> So what do you suggest I do? Putting the call inside #ifndef CONFIG_DEBUG
> is not a good option imo. Re-scheduling the tasklet wouldn't help, aiui
> (it would still run again right away). Moving the work to another CPU so
> this one can do other things isn't very good either - what if other CPUs
> are similarly busy? What remains is making things more complicated here by
> involving a timer, the handler of which would re-schedule the tasklet. I
> have to admit I don't like that very much either. All the more so since the
> use of process_pending_softirqs() is "just in case" here anyway - if lots
> of page tables were to be queued, I'd expect the queuing entity to be
> preempted before a rather large pile could accumulate.

I would be fine with adding a comment here that we don't expect the
processing of softirqs to be necessary, but it's added merely as a
safety guard.

Long term I think we want to go with the approach of freeing such
pages in guest context before resuming execution, much like how we
defer vPCI BAR mappings using vpci_process_pending().  But this requires
some extra work, and we would also need to handle the case of the
freeing being triggered in a remote domain context.

> Maybe I could make iommu_queue_free_pgtable() return non-void, to instruct
> the caller to bubble up a preemption notification once a certain number
> of pages have been queued for freeing. This might end up intrusive ...

Hm, it will end up being pretty arbitrary, as the time taken to free
the pages is highly variable depending on the contention on the heap
lock, and hence setting a limit on the number of pages will be hard.

Thanks, Roger.
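
To sketch the vpci_process_pending()-like direction mentioned above (the
function name and the iommu_free_list field are invented for illustration,
and the remote-domain case is ignored; only hypercall_preempt_check(),
free_domheap_page() and the page list helpers are existing interfaces):

    /* Hypothetical: run on the return-to-guest path, like vpci_process_pending(). */
    static bool iommu_process_pending_free(struct vcpu *v)
    {
        struct page_list_head *list = &v->domain->arch.iommu_free_list; /* invented field */
        struct page_info *pg;

        while ( (pg = page_list_remove_head(list)) )
        {
            free_domheap_page(pg);

            /* Defer the rest if softirqs (e.g. the timer) are pending. */
            if ( hypercall_preempt_check() )
                return true;  /* more work remains; run again before resuming */
        }

        return false;
    }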

Patch

--- a/xen/arch/x86/include/asm/iommu.h
+++ b/xen/arch/x86/include/asm/iommu.h
@@ -147,6 +147,7 @@  void iommu_free_domid(domid_t domid, uns
 int __must_check iommu_free_pgtables(struct domain *d);
 struct domain_iommu;
 struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
+void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
 
 #endif /* !__ARCH_X86_IOMMU_H__ */
 /*
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -12,6 +12,7 @@ 
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/cpu.h>
 #include <xen/sched.h>
 #include <xen/iocap.h>
 #include <xen/iommu.h>
@@ -566,6 +567,98 @@  struct page_info *iommu_alloc_pgtable(st
     return pg;
 }
 
+/*
+ * Intermediate page tables which get replaced by large pages may only be
+ * freed after a suitable IOTLB flush. Hence such pages get queued on a
+ * per-CPU list, with a per-CPU tasklet processing the list on the assumption
+ * that the necessary IOTLB flush will have occurred by the time tasklets get
+ * to run. (List and tasklet being per-CPU has the benefit of accesses not
+ * requiring any locking.)
+ */
+static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
+static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
+
+static void free_queued_pgtables(void *arg)
+{
+    struct page_list_head *list = arg;
+    struct page_info *pg;
+    unsigned int done = 0;
+
+    while ( (pg = page_list_remove_head(list)) )
+    {
+        free_domheap_page(pg);
+
+        /* Granularity of checking somewhat arbitrary. */
+        if ( !(++done & 0x1ff) )
+             process_pending_softirqs();
+    }
+}
+
+void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg)
+{
+    unsigned int cpu = smp_processor_id();
+
+    spin_lock(&hd->arch.pgtables.lock);
+    page_list_del(pg, &hd->arch.pgtables.list);
+    spin_unlock(&hd->arch.pgtables.lock);
+
+    page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
+
+    tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
+}
+
+static int cf_check cpu_callback(
+    struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+    unsigned int cpu = (unsigned long)hcpu;
+    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
+    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
+
+    switch ( action )
+    {
+    case CPU_DOWN_PREPARE:
+        tasklet_kill(tasklet);
+        break;
+
+    case CPU_DEAD:
+        if ( !page_list_empty(list) )
+        {
+            page_list_splice(list, &this_cpu(free_pgt_list));
+            INIT_PAGE_LIST_HEAD(list);
+            tasklet_schedule(&this_cpu(free_pgt_tasklet));
+        }
+        break;
+
+    case CPU_UP_PREPARE:
+        INIT_PAGE_LIST_HEAD(list);
+        fallthrough;
+    case CPU_DOWN_FAILED:
+        tasklet_init(tasklet, free_queued_pgtables, list);
+        if ( !page_list_empty(list) )
+            tasklet_schedule(tasklet);
+        break;
+    }
+
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block cpu_nfb = {
+    .notifier_call = cpu_callback,
+};
+
+static int __init cf_check bsp_init(void)
+{
+    if ( iommu_enabled )
+    {
+        cpu_callback(&cpu_nfb, CPU_UP_PREPARE,
+                     (void *)(unsigned long)smp_processor_id());
+        register_cpu_notifier(&cpu_nfb);
+    }
+
+    return 0;
+}
+presmp_initcall(bsp_init);
+
 bool arch_iommu_use_permitted(const struct domain *d)
 {
     /*