[v2,(resend),09/27] x86/pv: Rewrite how building PV dom0 handles domheap mappings

Message ID 20240116192611.41112-10-eliasely@amazon.com (mailing list archive)
State New
Series Remove the directmap

Commit Message

Elias El Yandouzi Jan. 16, 2024, 7:25 p.m. UTC
From: Hongyan Xia <hongyxia@amazon.com>

Building a PV dom0 allocates pages from the domheap but accesses them as
if they were xenheap pages, i.e. through the directmap. Map and unmap
the pages explicitly, as domheap pages should be used.
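To illustrate, a simplified sketch of the pattern (condensed from the
hunks below): instead of reaching the freshly allocated page-table page
through the directmap, map its MFN on demand and unmap it once done:

    /* Before: the allocation is accessed via the directmap. */
    l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;

    /* After: treat it as a domheap page; map and unmap explicitly. */
    l4start_mfn = maddr_to_mfn(mpt_alloc); mpt_alloc += PAGE_SIZE;
    l4start = l4tab = map_domain_page(l4start_mfn);
    /* ... build the page tables ... */
    UNMAP_DOMAIN_PAGE(l4start);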

Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Elias El Yandouzi <eliasely@amazon.com>

----

    Changes in V2:
        * Clarify the commit message
        * Break the patch in two parts

    Changes since Hongyan's version:
        * Rebase
        * Remove spurious newline

Comments

Jan Beulich Feb. 20, 2024, 10:28 a.m. UTC | #1
On 16.01.2024 20:25, Elias El Yandouzi wrote:
> --- a/xen/arch/x86/pv/dom0_build.c
> +++ b/xen/arch/x86/pv/dom0_build.c
> @@ -382,6 +382,10 @@ int __init dom0_construct_pv(struct domain *d,
>      l3_pgentry_t *l3tab = NULL, *l3start = NULL;
>      l2_pgentry_t *l2tab = NULL, *l2start = NULL;
>      l1_pgentry_t *l1tab = NULL, *l1start = NULL;
> +    mfn_t l4start_mfn = INVALID_MFN;
> +    mfn_t l3start_mfn = INVALID_MFN;
> +    mfn_t l2start_mfn = INVALID_MFN;
> +    mfn_t l1start_mfn = INVALID_MFN;

The reason initializers are needed here is, aiui, the overly large scope
of these variables. For example ...

> @@ -708,22 +712,32 @@ int __init dom0_construct_pv(struct domain *d,
>          v->arch.pv.event_callback_cs    = FLAT_COMPAT_KERNEL_CS;
>      }
>  
> +#define UNMAP_MAP_AND_ADVANCE(mfn_var, virt_var, maddr) \
> +do {                                                    \
> +    unmap_domain_page(virt_var);                        \
> +    mfn_var = maddr_to_mfn(maddr);                      \
> +    maddr += PAGE_SIZE;                                 \
> +    virt_var = map_domain_page(mfn_var);                \
> +} while ( false )
> +
>      if ( !compat )
>      {
>          maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
> -        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> +        UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
> +        l4tab = l4start;
>          clear_page(l4tab);
> -        init_xen_l4_slots(l4tab, _mfn(virt_to_mfn(l4start)),
> -                          d, INVALID_MFN, true);
> -        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> +        init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
> +        v->arch.guest_table = pagetable_from_mfn(l4start_mfn);

... looks to be required only here, while ...

>      }
>      else
>      {
>          /* Monitor table already created by switch_compat(). */
> -        l4start = l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
> +        l4start_mfn = pagetable_get_mfn(v->arch.guest_table);
> +        l4start = l4tab = map_domain_page(l4start_mfn);

... in principle the use of the variable could be avoided here. Below
from here there's no further use of it.
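As an illustration of that narrower scope (a sketch only, not what the
patch does): l4start_mfn could be declared inside the !compat branch,
removing the need for the INVALID_MFN initializer, while the compat
branch maps directly from the guest table without the variable:

    if ( !compat )
    {
        mfn_t l4start_mfn;   /* no initializer needed at this scope */

        maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
        UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
        l4tab = l4start;
        clear_page(l4tab);
        init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
        v->arch.guest_table = pagetable_from_mfn(l4start_mfn);
    }
    else
    {
        /* Monitor table already created by switch_compat(). */
        l4start = l4tab = map_domain_page(pagetable_get_mfn(v->arch.guest_table));
        ...
    }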

> @@ -781,30 +797,34 @@ int __init dom0_construct_pv(struct domain *d,
>  
>      if ( compat )
>      {
> -        l2_pgentry_t *l2t;
> -
>          /* Ensure the first four L3 entries are all populated. */
>          for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
>          {
>              if ( !l3e_get_intpte(*l3tab) )
>              {
>                  maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
> -                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
> -                clear_page(l2tab);
> -                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
> +                UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
> +                clear_page(l2start);
> +                *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
>              }

The updating of l2start is only conditional here, yet ...

>              if ( i == 3 )
>                  l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
>          }
>  
> -        l2t = map_l2t_from_l3e(l3start[3]);
> -        init_xen_pae_l2_slots(l2t, d);
> -        unmap_domain_page(l2t);
> +        init_xen_pae_l2_slots(l2start, d);

... here you assume it points at the page referenced by the 3rd L3 entry.
Question is why the original code is being replaced here in the first
place: It was already suitably mapping the page in question.
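For reference, the pre-patch sequence already goes through the mapcache
and picks up whichever L2 page slot 3 of l3start references, independent
of whether the loop above re-mapped l2start, so it could simply be kept:

    l2_pgentry_t *l2t;
    ...
    l2t = map_l2t_from_l3e(l3start[3]);   /* map the L2 referenced by L3 slot 3 */
    init_xen_pae_l2_slots(l2t, d);
    unmap_domain_page(l2t);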

Jan
Elias El Yandouzi May 7, 2024, 3:21 p.m. UTC | #2
> On 20/02/2024 10:28, Jan Beulich wrote:
>> On 16.01.2024 20:25, Elias El Yandouzi wrote:
>> --- a/xen/arch/x86/pv/dom0_build.c
>> +++ b/xen/arch/x86/pv/dom0_build.c
>> @@ -382,6 +382,10 @@ int __init dom0_construct_pv(struct domain *d,
>>       l3_pgentry_t *l3tab = NULL, *l3start = NULL;
>>       l2_pgentry_t *l2tab = NULL, *l2start = NULL;
>>       l1_pgentry_t *l1tab = NULL, *l1start = NULL;
>> +    mfn_t l4start_mfn = INVALID_MFN;
>> +    mfn_t l3start_mfn = INVALID_MFN;
>> +    mfn_t l2start_mfn = INVALID_MFN;
>> +    mfn_t l1start_mfn = INVALID_MFN;
> 
> The reason initializers are needed here is, aiui, the overly large scope
> of these variables. For example ...
> 

Correct. Is that just an observation, or would you like me to do anything
about it?

>> @@ -708,22 +712,32 @@ int __init dom0_construct_pv(struct domain *d,
>>           v->arch.pv.event_callback_cs    = FLAT_COMPAT_KERNEL_CS;
>>       }
>>   
>> +#define UNMAP_MAP_AND_ADVANCE(mfn_var, virt_var, maddr) \
>> +do {                                                    \
>> +    unmap_domain_page(virt_var);                        \
>> +    mfn_var = maddr_to_mfn(maddr);                      \
>> +    maddr += PAGE_SIZE;                                 \
>> +    virt_var = map_domain_page(mfn_var);                \
>> +} while ( false )
>> +
>>       if ( !compat )
>>       {
>>           maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
>> -        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>> +        UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
>> +        l4tab = l4start;
>>           clear_page(l4tab);
>> -        init_xen_l4_slots(l4tab, _mfn(virt_to_mfn(l4start)),
>> -                          d, INVALID_MFN, true);
>> -        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
>> +        init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
>> +        v->arch.guest_table = pagetable_from_mfn(l4start_mfn);
> 
> ... looks to be required only here, while ...
> 
>>       }
>>       else
>>       {
>>           /* Monitor table already created by switch_compat(). */
>> -        l4start = l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
>> +        l4start_mfn = pagetable_get_mfn(v->arch.guest_table);
>> +        l4start = l4tab = map_domain_page(l4start_mfn);
> 
> ... in principle the use of the variable could be avoided here. Below
> from here there's no further use of it.
> 
>> @@ -781,30 +797,34 @@ int __init dom0_construct_pv(struct domain *d,
>>   
>>       if ( compat )
>>       {
>> -        l2_pgentry_t *l2t;
>> -
>>           /* Ensure the first four L3 entries are all populated. */
>>           for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
>>           {
>>               if ( !l3e_get_intpte(*l3tab) )
>>               {
>>                   maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
>> -                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>> -                clear_page(l2tab);
>> -                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
>> +                UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
>> +                clear_page(l2start);
>> +                *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
>>               }
> 
> The updating of l2start is only conditional here, yet ...
> 
>>               if ( i == 3 )
>>                   l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
>>           }
>>   
>> -        l2t = map_l2t_from_l3e(l3start[3]);
>> -        init_xen_pae_l2_slots(l2t, d);
>> -        unmap_domain_page(l2t);
>> +        init_xen_pae_l2_slots(l2start, d);
> 
> ... here you assume it points at the page referenced by the 3rd L3 entry.

Hmm, I missed it when sending the revision and indeed it doesn't look 
correct.

> Question is why the original code is being replaced here in the first
> place: It was already suitably mapping the page in question.

The code was already suitably mapping the pages in question. This patch 
doesn't aim to make any functional change, only to rework how the 
domheap pages are used. The goal of the series is to remove the mappings 
from the directmap, which means those pages need to be mapped and 
unmapped when required.

That is all this patch does; see the `UNMAP_MAP_AND_ADVANCE()` macro.
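For reference, the L4 call site UNMAP_MAP_AND_ADVANCE(l4start_mfn, 
l4start, mpt_alloc) expands to roughly:

    unmap_domain_page(l4start);             /* drop the previous mapping */
    l4start_mfn = maddr_to_mfn(mpt_alloc);  /* page just consumed from mpt_alloc */
    mpt_alloc += PAGE_SIZE;
    l4start = map_domain_page(l4start_mfn); /* map it via the mapcache */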

Elias
Jan Beulich May 14, 2024, 9:52 a.m. UTC | #3
On 07.05.2024 17:21, Elias El Yandouzi wrote:
>  > On 20/02/2024 10:28, Jan Beulich wrote:
>>> On 16.01.2024 20:25, Elias El Yandouzi wrote:
>>> --- a/xen/arch/x86/pv/dom0_build.c
>>> +++ b/xen/arch/x86/pv/dom0_build.c
>>> @@ -382,6 +382,10 @@ int __init dom0_construct_pv(struct domain *d,
>>>       l3_pgentry_t *l3tab = NULL, *l3start = NULL;
>>>       l2_pgentry_t *l2tab = NULL, *l2start = NULL;
>>>       l1_pgentry_t *l1tab = NULL, *l1start = NULL;
>>> +    mfn_t l4start_mfn = INVALID_MFN;
>>> +    mfn_t l3start_mfn = INVALID_MFN;
>>> +    mfn_t l2start_mfn = INVALID_MFN;
>>> +    mfn_t l1start_mfn = INVALID_MFN;
>>
>> The reason initializers are needed here is, aiui, the overly large scope
>> of these variables. For example ...
> 
> Correct, is it just an observation or do you want me to do anything?

Where possible, reducing the scope of variables would be preferred. Hence
why I ...

>>> @@ -708,22 +712,32 @@ int __init dom0_construct_pv(struct domain *d,
>>>           v->arch.pv.event_callback_cs    = FLAT_COMPAT_KERNEL_CS;
>>>       }
>>>   
>>> +#define UNMAP_MAP_AND_ADVANCE(mfn_var, virt_var, maddr) \
>>> +do {                                                    \
>>> +    unmap_domain_page(virt_var);                        \
>>> +    mfn_var = maddr_to_mfn(maddr);                      \
>>> +    maddr += PAGE_SIZE;                                 \
>>> +    virt_var = map_domain_page(mfn_var);                \
>>> +} while ( false )
>>> +
>>>       if ( !compat )
>>>       {
>>>           maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
>>> -        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>>> +        UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
>>> +        l4tab = l4start;
>>>           clear_page(l4tab);
>>> -        init_xen_l4_slots(l4tab, _mfn(virt_to_mfn(l4start)),
>>> -                          d, INVALID_MFN, true);
>>> -        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
>>> +        init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
>>> +        v->arch.guest_table = pagetable_from_mfn(l4start_mfn);
>>
>> ... looks to be required only here, while ...
>>
>>>       }
>>>       else
>>>       {
>>>           /* Monitor table already created by switch_compat(). */
>>> -        l4start = l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
>>> +        l4start_mfn = pagetable_get_mfn(v->arch.guest_table);
>>> +        l4start = l4tab = map_domain_page(l4start_mfn);
>>
>> ... in principle the use of the variable could be avoided here. Below
>> from here there's no further use of it.

... went into some detail towards that possibility.

>>> @@ -781,30 +797,34 @@ int __init dom0_construct_pv(struct domain *d,
>>>   
>>>       if ( compat )
>>>       {
>>> -        l2_pgentry_t *l2t;
>>> -
>>>           /* Ensure the first four L3 entries are all populated. */
>>>           for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
>>>           {
>>>               if ( !l3e_get_intpte(*l3tab) )
>>>               {
>>>                   maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
>>> -                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>>> -                clear_page(l2tab);
>>> -                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
>>> +                UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
>>> +                clear_page(l2start);
>>> +                *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
>>>               }
>>
>> The updating of l2start is only conditional here, yet ...
>>
>>>               if ( i == 3 )
>>>                   l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
>>>           }
>>>   
>>> -        l2t = map_l2t_from_l3e(l3start[3]);
>>> -        init_xen_pae_l2_slots(l2t, d);
>>> -        unmap_domain_page(l2t);
>>> +        init_xen_pae_l2_slots(l2start, d);
>>
>> ... here you assume it points at the page referenced by the 3rd L3 entry.
> 
> Hmm, I missed it when sending the revision and indeed it doesn't look 
> correct.
> 
>> Question is why the original code is being replaced here in the first
>> place: It was already suitably mapping the page in question.
> 
> The code was already suitably mapping the pages in question. This patch 
> doesn't aim to make any functional change, just to rework how the 
> domheap pages are used. The goal of the series is to remove the mappings 
> from the directmap, which means those pages needs to be mapped and 
> unmapped when required.

But that doesn't address my question: If there's nothing wrong with the
earlier code, why does it need changing (right here)?

Jan

Patch

diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 5659814e0c..dc5e9fe117 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -382,6 +382,10 @@  int __init dom0_construct_pv(struct domain *d,
     l3_pgentry_t *l3tab = NULL, *l3start = NULL;
     l2_pgentry_t *l2tab = NULL, *l2start = NULL;
     l1_pgentry_t *l1tab = NULL, *l1start = NULL;
+    mfn_t l4start_mfn = INVALID_MFN;
+    mfn_t l3start_mfn = INVALID_MFN;
+    mfn_t l2start_mfn = INVALID_MFN;
+    mfn_t l1start_mfn = INVALID_MFN;
 
     /*
      * This fully describes the memory layout of the initial domain. All
@@ -708,22 +712,32 @@  int __init dom0_construct_pv(struct domain *d,
         v->arch.pv.event_callback_cs    = FLAT_COMPAT_KERNEL_CS;
     }
 
+#define UNMAP_MAP_AND_ADVANCE(mfn_var, virt_var, maddr) \
+do {                                                    \
+    unmap_domain_page(virt_var);                        \
+    mfn_var = maddr_to_mfn(maddr);                      \
+    maddr += PAGE_SIZE;                                 \
+    virt_var = map_domain_page(mfn_var);                \
+} while ( false )
+
     if ( !compat )
     {
         maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table;
-        l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+        UNMAP_MAP_AND_ADVANCE(l4start_mfn, l4start, mpt_alloc);
+        l4tab = l4start;
         clear_page(l4tab);
-        init_xen_l4_slots(l4tab, _mfn(virt_to_mfn(l4start)),
-                          d, INVALID_MFN, true);
-        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
+        init_xen_l4_slots(l4tab, l4start_mfn, d, INVALID_MFN, true);
+        v->arch.guest_table = pagetable_from_mfn(l4start_mfn);
     }
     else
     {
         /* Monitor table already created by switch_compat(). */
-        l4start = l4tab = __va(pagetable_get_paddr(v->arch.guest_table));
+        l4start_mfn = pagetable_get_mfn(v->arch.guest_table);
+        l4start = l4tab = map_domain_page(l4start_mfn);
         /* See public/xen.h on why the following is needed. */
         maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
         l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+        UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
     }
 
     l4tab += l4_table_offset(v_start);
@@ -733,14 +747,16 @@  int __init dom0_construct_pv(struct domain *d,
         if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
         {
             maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
-            l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+            UNMAP_MAP_AND_ADVANCE(l1start_mfn, l1start, mpt_alloc);
+            l1tab = l1start;
             clear_page(l1tab);
             if ( count == 0 )
                 l1tab += l1_table_offset(v_start);
             if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) )
             {
                 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
-                l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+                UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
+                l2tab = l2start;
                 clear_page(l2tab);
                 if ( count == 0 )
                     l2tab += l2_table_offset(v_start);
@@ -750,19 +766,19 @@  int __init dom0_construct_pv(struct domain *d,
                     {
                         maddr_to_page(mpt_alloc)->u.inuse.type_info =
                             PGT_l3_page_table;
-                        l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
+                        UNMAP_MAP_AND_ADVANCE(l3start_mfn, l3start, mpt_alloc);
                     }
                     l3tab = l3start;
                     clear_page(l3tab);
                     if ( count == 0 )
                         l3tab += l3_table_offset(v_start);
-                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
+                    *l4tab = l4e_from_mfn(l3start_mfn, L4_PROT);
                     l4tab++;
                 }
-                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
+                *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
                 l3tab++;
             }
-            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
+            *l2tab = l2e_from_mfn(l1start_mfn, L2_PROT);
             l2tab++;
         }
         if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
@@ -781,30 +797,34 @@  int __init dom0_construct_pv(struct domain *d,
 
     if ( compat )
     {
-        l2_pgentry_t *l2t;
-
         /* Ensure the first four L3 entries are all populated. */
         for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab )
         {
             if ( !l3e_get_intpte(*l3tab) )
             {
                 maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table;
-                l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
-                clear_page(l2tab);
-                *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT);
+                UNMAP_MAP_AND_ADVANCE(l2start_mfn, l2start, mpt_alloc);
+                clear_page(l2start);
+                *l3tab = l3e_from_mfn(l2start_mfn, L3_PROT);
             }
             if ( i == 3 )
                 l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2;
         }
 
-        l2t = map_l2t_from_l3e(l3start[3]);
-        init_xen_pae_l2_slots(l2t, d);
-        unmap_domain_page(l2t);
+        init_xen_pae_l2_slots(l2start, d);
     }
 
+#undef UNMAP_MAP_AND_ADVANCE
+
+    UNMAP_DOMAIN_PAGE(l1start);
+    UNMAP_DOMAIN_PAGE(l2start);
+    UNMAP_DOMAIN_PAGE(l3start);
+
     /* Pages that are part of page tables must be read only. */
     mark_pv_pt_pages_rdonly(d, l4start, vpt_start, nr_pt_pages, &flush_flags);
 
+    UNMAP_DOMAIN_PAGE(l4start);
+
     /* Mask all upcalls... */
     for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
         shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;