diff mbox series

[v4,5/9] x86/mem-sharing: copy GADDR based shared guest areas

Message ID 6047c192-0f37-e4ff-5980-fd137b3f1869@suse.com (mailing list archive)
State New, archived
Headers show
Series runstate/time area registration by (guest) physical address | expand

Commit Message

Jan Beulich Sept. 28, 2023, 7:16 a.m. UTC
In preparation of the introduction of new vCPU operations allowing to
register the respective areas (one of the two is x86-specific) by
guest-physical address, add the necessary fork handling (with the
backing function yet to be filled in).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Extend comment.

Comments

Roger Pau Monne Sept. 28, 2023, 9:51 a.m. UTC | #1
On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
> In preparation of the introduction of new vCPU operations allowing to
> register the respective areas (one of the two is x86-specific) by
> guest-physical address, add the necessary fork handling (with the
> backing function yet to be filled in).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: Extend comment.
> 
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
>      hvm_set_nonreg_state(cd_vcpu, &nrs);
>  }
>  
> +static int copy_guest_area(struct guest_area *cd_area,
> +                           const struct guest_area *d_area,
> +                           struct vcpu *cd_vcpu,
> +                           const struct domain *d)
> +{
> +    mfn_t d_mfn, cd_mfn;
> +
> +    if ( !d_area->pg )
> +        return 0;
> +
> +    d_mfn = page_to_mfn(d_area->pg);
> +
> +    /* Allocate & map a page for the area if it hasn't been already. */
> +    if ( !cd_area->pg )
> +    {
> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> +        p2m_type_t p2mt;
> +        p2m_access_t p2ma;
> +        unsigned int offset;
> +        int ret;
> +
> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> +        {
> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
> +
> +            if ( !pg )
> +                return -ENOMEM;
> +
> +            cd_mfn = page_to_mfn(pg);
> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> +
> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> +                                 p2m->default_access, -1);
> +            if ( ret )
> +                return ret;
> +        }

I'm still unsure why map_guest_area() shouldn't be able to deal with a
forked child needing the page to be mapped.  What happens when a
forked child executes the hypercall to map such areas against not yet
populated gfns?

Shouldn't map_guest_area() be capable of handling those calls and
populating on demand?

> +        else if ( p2mt != p2m_ram_rw )
> +            return -EBUSY;
> +
> +        /*
> +         * Map the area into the guest. For simplicity specify the entire range
> +         * up to the end of the page: All the function uses it for is to check
> +         * that the range doesn't cross page boundaries. Having the area mapped
> +         * in the original domain implies that it fits there and therefore will
> +         * also fit in the clone.
> +         */
> +        offset = PAGE_OFFSET(d_area->map);
> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> +                             PAGE_SIZE - offset, cd_area, NULL);
> +        if ( ret )
> +            return ret;
> +    }
> +    else
> +        cd_mfn = page_to_mfn(cd_area->pg);
> +
> +    copy_domain_page(cd_mfn, d_mfn);

I think the page copy should be done only once, when the page is
populated on the child p2m.  Otherwise areas smaller than a page size
(like vpcu_time_info_t) that share the same page will get multiple
copies of the same data for no reason.

Thanks, Roger.
Jan Beulich Sept. 28, 2023, 10:11 a.m. UTC | #2
On 28.09.2023 11:51, Roger Pau Monné wrote:
> On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
>> --- a/xen/arch/x86/mm/mem_sharing.c
>> +++ b/xen/arch/x86/mm/mem_sharing.c
>> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
>>      hvm_set_nonreg_state(cd_vcpu, &nrs);
>>  }
>>  
>> +static int copy_guest_area(struct guest_area *cd_area,
>> +                           const struct guest_area *d_area,
>> +                           struct vcpu *cd_vcpu,
>> +                           const struct domain *d)
>> +{
>> +    mfn_t d_mfn, cd_mfn;
>> +
>> +    if ( !d_area->pg )
>> +        return 0;
>> +
>> +    d_mfn = page_to_mfn(d_area->pg);
>> +
>> +    /* Allocate & map a page for the area if it hasn't been already. */
>> +    if ( !cd_area->pg )
>> +    {
>> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
>> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
>> +        p2m_type_t p2mt;
>> +        p2m_access_t p2ma;
>> +        unsigned int offset;
>> +        int ret;
>> +
>> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
>> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
>> +        {
>> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
>> +
>> +            if ( !pg )
>> +                return -ENOMEM;
>> +
>> +            cd_mfn = page_to_mfn(pg);
>> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
>> +
>> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
>> +                                 p2m->default_access, -1);
>> +            if ( ret )
>> +                return ret;
>> +        }
> 
> I'm still unsure why map_guest_area() shouldn't be able to deal with a
> forked child needing the page to be mapped.  What happens when a
> forked child executes the hypercall to map such areas against not yet
> populated gfns?
> 
> Shouldn't map_guest_area() be capable of handling those calls and
> populating on demand?

Funny you should use this wording: P2M_ALLOC will deal with populating
PoD slots, yes, but aiui forks don't fill their p2m with all PoD slots,
but rather leave them empty. Yet again I need to leave it to Tamas to
confirm or prove me wrong.

>> +        else if ( p2mt != p2m_ram_rw )
>> +            return -EBUSY;
>> +
>> +        /*
>> +         * Map the area into the guest. For simplicity specify the entire range
>> +         * up to the end of the page: All the function uses it for is to check
>> +         * that the range doesn't cross page boundaries. Having the area mapped
>> +         * in the original domain implies that it fits there and therefore will
>> +         * also fit in the clone.
>> +         */
>> +        offset = PAGE_OFFSET(d_area->map);
>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>> +                             PAGE_SIZE - offset, cd_area, NULL);
>> +        if ( ret )
>> +            return ret;
>> +    }
>> +    else
>> +        cd_mfn = page_to_mfn(cd_area->pg);
>> +
>> +    copy_domain_page(cd_mfn, d_mfn);
> 
> I think the page copy should be done only once, when the page is
> populated on the child p2m.  Otherwise areas smaller than a page size
> (like vpcu_time_info_t) that share the same page will get multiple
> copies of the same data for no reason.

I think you're right, but this would then be another issue in the original
code that I merely didn't spot (and it's not just "copy for no reason",
we'd actually corrupt what was put there before). IOW the copying needs to
move ahead of map_guest_area() (or yet more precisely after the error
checking for p2m->set_entry()), and in the original code it would have
needed to live ahead of map_vcpu_info(). Once again I'd like Tamas to
confirm (or otherwise) before making that change, though.

Jan
Roger Pau Monne Sept. 28, 2023, 11:07 a.m. UTC | #3
On Thu, Sep 28, 2023 at 12:11:02PM +0200, Jan Beulich wrote:
> On 28.09.2023 11:51, Roger Pau Monné wrote:
> > On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
> >> --- a/xen/arch/x86/mm/mem_sharing.c
> >> +++ b/xen/arch/x86/mm/mem_sharing.c
> >> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
> >>      hvm_set_nonreg_state(cd_vcpu, &nrs);
> >>  }
> >>  
> >> +static int copy_guest_area(struct guest_area *cd_area,
> >> +                           const struct guest_area *d_area,
> >> +                           struct vcpu *cd_vcpu,
> >> +                           const struct domain *d)
> >> +{
> >> +    mfn_t d_mfn, cd_mfn;
> >> +
> >> +    if ( !d_area->pg )
> >> +        return 0;
> >> +
> >> +    d_mfn = page_to_mfn(d_area->pg);
> >> +
> >> +    /* Allocate & map a page for the area if it hasn't been already. */
> >> +    if ( !cd_area->pg )
> >> +    {
> >> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> >> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> >> +        p2m_type_t p2mt;
> >> +        p2m_access_t p2ma;
> >> +        unsigned int offset;
> >> +        int ret;
> >> +
> >> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
> >> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> >> +        {
> >> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
> >> +
> >> +            if ( !pg )
> >> +                return -ENOMEM;
> >> +
> >> +            cd_mfn = page_to_mfn(pg);
> >> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> >> +
> >> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> >> +                                 p2m->default_access, -1);
> >> +            if ( ret )
> >> +                return ret;
> >> +        }
> > 
> > I'm still unsure why map_guest_area() shouldn't be able to deal with a
> > forked child needing the page to be mapped.  What happens when a
> > forked child executes the hypercall to map such areas against not yet
> > populated gfns?
> > 
> > Shouldn't map_guest_area() be capable of handling those calls and
> > populating on demand?
> 
> Funny you should use this wording: P2M_ALLOC will deal with populating
> PoD slots, yes, but aiui forks don't fill their p2m with all PoD slots,
> but rather leave them empty. Yet again I need to leave it to Tamas to
> confirm or prove me wrong.

If the child p2m populating is only triggered by guest accesses then a
lot of hypercalls are likely to not work correctly on childs.

> 
> >> +        else if ( p2mt != p2m_ram_rw )
> >> +            return -EBUSY;
> >> +
> >> +        /*
> >> +         * Map the area into the guest. For simplicity specify the entire range
> >> +         * up to the end of the page: All the function uses it for is to check
> >> +         * that the range doesn't cross page boundaries. Having the area mapped
> >> +         * in the original domain implies that it fits there and therefore will
> >> +         * also fit in the clone.
> >> +         */
> >> +        offset = PAGE_OFFSET(d_area->map);
> >> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> >> +                             PAGE_SIZE - offset, cd_area, NULL);
> >> +        if ( ret )
> >> +            return ret;
> >> +    }
> >> +    else
> >> +        cd_mfn = page_to_mfn(cd_area->pg);
> >> +
> >> +    copy_domain_page(cd_mfn, d_mfn);
> > 
> > I think the page copy should be done only once, when the page is
> > populated on the child p2m.  Otherwise areas smaller than a page size
> > (like vpcu_time_info_t) that share the same page will get multiple
> > copies of the same data for no reason.
> 
> I think you're right, but this would then be another issue in the original
> code that I merely didn't spot (and it's not just "copy for no reason",
> we'd actually corrupt what was put there before). IOW the copying needs to
> move ahead of map_guest_area() (or yet more precisely after the error
> checking for p2m->set_entry()), and in the original code it would have
> needed to live ahead of map_vcpu_info(). Once again I'd like Tamas to
> confirm (or otherwise) before making that change, though.

Yes, it's already an issue in the current code.  I wonder whether
logic in the guest or Xen could malfunctions due to the fact that
map_vcpu_info() unconditionally sets evtchn_upcall_pending and injects
an event channel upcall, but the later call to copy_domain_page()
might unset evtchn_upcall_pending while the vector is already injected.

Thanks, Roger.
Jan Beulich Sept. 28, 2023, 1:19 p.m. UTC | #4
On 28.09.2023 14:57, Tamas K Lengyel wrote:
> On Thu, Sep 28, 2023 at 7:08 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>> On Thu, Sep 28, 2023 at 12:11:02PM +0200, Jan Beulich wrote:
>>> On 28.09.2023 11:51, Roger Pau Monné wrote:
>>>> On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
>>>>> +        /*
>>>>> +         * Map the area into the guest. For simplicity specify the entire range
>>>>> +         * up to the end of the page: All the function uses it for is to check
>>>>> +         * that the range doesn't cross page boundaries. Having the area mapped
>>>>> +         * in the original domain implies that it fits there and therefore will
>>>>> +         * also fit in the clone.
>>>>> +         */
>>>>> +        offset = PAGE_OFFSET(d_area->map);
>>>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>>>>> +                             PAGE_SIZE - offset, cd_area, NULL);
>>>>> +        if ( ret )
>>>>> +            return ret;
>>>>> +    }
>>>>> +    else
>>>>> +        cd_mfn = page_to_mfn(cd_area->pg);
>>>>> +
>>>>> +    copy_domain_page(cd_mfn, d_mfn);
>>>>
>>>> I think the page copy should be done only once, when the page is
>>>> populated on the child p2m.  Otherwise areas smaller than a page size
>>>> (like vpcu_time_info_t) that share the same page will get multiple
>>>> copies of the same data for no reason.
>>>
>>> I think you're right, but this would then be another issue in the original
>>> code that I merely didn't spot (and it's not just "copy for no reason",
>>> we'd actually corrupt what was put there before). IOW the copying needs to
>>> move ahead of map_guest_area() (or yet more precisely after the error
>>> checking for p2m->set_entry()), and in the original code it would have
>>> needed to live ahead of map_vcpu_info(). Once again I'd like Tamas to
>>> confirm (or otherwise) before making that change, though.
>>
>> Yes, it's already an issue in the current code.  I wonder whether
>> logic in the guest or Xen could malfunctions due to the fact that
>> map_vcpu_info() unconditionally sets evtchn_upcall_pending and injects
>> an event channel upcall, but the later call to copy_domain_page()
>> might unset evtchn_upcall_pending while the vector is already injected.
> 
> Sorry but I really don't follow the discussion here. My understanding
> was that map_vcpu_info, as its name suggests, maps the page. We use it
> to map a new page into that position in case the fork hasn't set it up
> yet but the parent has one. Then we follow with the copy from the
> parent so the page content is matching. If there is already a
> vcpu_info page in the fork, we just do the copy.
> 
> Now, if map_vcpu_info does more then mapping, then I don't know what
> it does, why it does it, and what happens if we skip it when the fork
> is reset for example. Is the suggestion to call it map_vcpu_info every
> time the page content is reset (ie after the copy)?

The vCPU info area (already prior to this series) and the two other areas
can be updated by the hypervisor at any time. Once one such area is
registered within a certain page, if another such area happens to live in
the same page, copying the entire page again would overwrite all updates
that might already have been made for the first area. IOW copying ought
to - imo - happen exactly once, when the new page is allocated.

As to map_vcpu_info() - just look at the function: It writes to the newly
registered area. Even if the function name says just "map", that's an
integral part of the operation. We can't just map it, but leave the area
untouched.

Jan
Roger Pau Monne Sept. 28, 2023, 2:08 p.m. UTC | #5
On Thu, Sep 28, 2023 at 08:57:04AM -0400, Tamas K Lengyel wrote:
> On Thu, Sep 28, 2023 at 7:08 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Thu, Sep 28, 2023 at 12:11:02PM +0200, Jan Beulich wrote:
> > > On 28.09.2023 11:51, Roger Pau Monné wrote:
> > > > On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
> > > >> --- a/xen/arch/x86/mm/mem_sharing.c
> > > >> +++ b/xen/arch/x86/mm/mem_sharing.c
> > > >> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
> > > >>      hvm_set_nonreg_state(cd_vcpu, &nrs);
> > > >>  }
> > > >>
> > > >> +static int copy_guest_area(struct guest_area *cd_area,
> > > >> +                           const struct guest_area *d_area,
> > > >> +                           struct vcpu *cd_vcpu,
> > > >> +                           const struct domain *d)
> > > >> +{
> > > >> +    mfn_t d_mfn, cd_mfn;
> > > >> +
> > > >> +    if ( !d_area->pg )
> > > >> +        return 0;
> > > >> +
> > > >> +    d_mfn = page_to_mfn(d_area->pg);
> > > >> +
> > > >> +    /* Allocate & map a page for the area if it hasn't been already. */
> > > >> +    if ( !cd_area->pg )
> > > >> +    {
> > > >> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> > > >> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> > > >> +        p2m_type_t p2mt;
> > > >> +        p2m_access_t p2ma;
> > > >> +        unsigned int offset;
> > > >> +        int ret;
> > > >> +
> > > >> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
> > > >> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> > > >> +        {
> > > >> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
> > > >> +
> > > >> +            if ( !pg )
> > > >> +                return -ENOMEM;
> > > >> +
> > > >> +            cd_mfn = page_to_mfn(pg);
> > > >> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> > > >> +
> > > >> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> > > >> +                                 p2m->default_access, -1);
> > > >> +            if ( ret )
> > > >> +                return ret;
> > > >> +        }
> > > >
> > > > I'm still unsure why map_guest_area() shouldn't be able to deal with a
> > > > forked child needing the page to be mapped.  What happens when a
> > > > forked child executes the hypercall to map such areas against not yet
> > > > populated gfns?
> > > >
> > > > Shouldn't map_guest_area() be capable of handling those calls and
> > > > populating on demand?
> > >
> > > Funny you should use this wording: P2M_ALLOC will deal with populating
> > > PoD slots, yes, but aiui forks don't fill their p2m with all PoD slots,
> > > but rather leave them empty. Yet again I need to leave it to Tamas to
> > > confirm or prove me wrong.
> >
> > If the child p2m populating is only triggered by guest accesses then a
> > lot of hypercalls are likely to not work correctly on childs.
> 
> That's not what's happening. As I said before, ALL access paths, be
> that from the guest, dom0 or Xen trigger page population. If there is
> a hole and P2M_ALLOC is set we do the following in
> p2m_get_gfn_type_access:
> 
>     /* Check if we need to fork the page */
>     if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
>          !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
>         mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> 
> Depending on the type of access we either populate the hole with a
> shared memory entry or a copy.

Then the code here is redundant, as the call to get_page_from_gfn(...,
P2M_UNSHARE) in map_vcpu_info() will already take care of populating
the child p2m and copy the parents page contents?

Thanks, Roger.
Roger Pau Monne Sept. 29, 2023, 2:17 p.m. UTC | #6
On Fri, Sep 29, 2023 at 08:31:58AM -0400, Tamas K Lengyel wrote:
> On Thu, Sep 28, 2023 at 10:08 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> >
> > On Thu, Sep 28, 2023 at 08:57:04AM -0400, Tamas K Lengyel wrote:
> > > On Thu, Sep 28, 2023 at 7:08 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
> > > >
> > > > On Thu, Sep 28, 2023 at 12:11:02PM +0200, Jan Beulich wrote:
> > > > > On 28.09.2023 11:51, Roger Pau Monné wrote:
> > > > > > On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
> > > > > >> --- a/xen/arch/x86/mm/mem_sharing.c
> > > > > >> +++ b/xen/arch/x86/mm/mem_sharing.c
> > > > > >> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
> > > > > >>      hvm_set_nonreg_state(cd_vcpu, &nrs);
> > > > > >>  }
> > > > > >>
> > > > > >> +static int copy_guest_area(struct guest_area *cd_area,
> > > > > >> +                           const struct guest_area *d_area,
> > > > > >> +                           struct vcpu *cd_vcpu,
> > > > > >> +                           const struct domain *d)
> > > > > >> +{
> > > > > >> +    mfn_t d_mfn, cd_mfn;
> > > > > >> +
> > > > > >> +    if ( !d_area->pg )
> > > > > >> +        return 0;
> > > > > >> +
> > > > > >> +    d_mfn = page_to_mfn(d_area->pg);
> > > > > >> +
> > > > > >> +    /* Allocate & map a page for the area if it hasn't been already. */
> > > > > >> +    if ( !cd_area->pg )
> > > > > >> +    {
> > > > > >> +        gfn_t gfn = mfn_to_gfn(d, d_mfn);
> > > > > >> +        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> > > > > >> +        p2m_type_t p2mt;
> > > > > >> +        p2m_access_t p2ma;
> > > > > >> +        unsigned int offset;
> > > > > >> +        int ret;
> > > > > >> +
> > > > > >> +        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
> > > > > >> +        if ( mfn_eq(cd_mfn, INVALID_MFN) )
> > > > > >> +        {
> > > > > >> +            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
> > > > > >> +
> > > > > >> +            if ( !pg )
> > > > > >> +                return -ENOMEM;
> > > > > >> +
> > > > > >> +            cd_mfn = page_to_mfn(pg);
> > > > > >> +            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> > > > > >> +
> > > > > >> +            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> > > > > >> +                                 p2m->default_access, -1);
> > > > > >> +            if ( ret )
> > > > > >> +                return ret;
> > > > > >> +        }
> > > > > >
> > > > > > I'm still unsure why map_guest_area() shouldn't be able to deal with a
> > > > > > forked child needing the page to be mapped.  What happens when a
> > > > > > forked child executes the hypercall to map such areas against not yet
> > > > > > populated gfns?
> > > > > >
> > > > > > Shouldn't map_guest_area() be capable of handling those calls and
> > > > > > populating on demand?
> > > > >
> > > > > Funny you should use this wording: P2M_ALLOC will deal with populating
> > > > > PoD slots, yes, but aiui forks don't fill their p2m with all PoD slots,
> > > > > but rather leave them empty. Yet again I need to leave it to Tamas to
> > > > > confirm or prove me wrong.
> > > >
> > > > If the child p2m populating is only triggered by guest accesses then a
> > > > lot of hypercalls are likely to not work correctly on childs.
> > >
> > > That's not what's happening. As I said before, ALL access paths, be
> > > that from the guest, dom0 or Xen trigger page population. If there is
> > > a hole and P2M_ALLOC is set we do the following in
> > > p2m_get_gfn_type_access:
> > >
> > >     /* Check if we need to fork the page */
> > >     if ( (q & P2M_ALLOC) && p2m_is_hole(*t) &&
> > >          !mem_sharing_fork_page(p2m->domain, gfn, q & P2M_UNSHARE) )
> > >         mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
> > >
> > > Depending on the type of access we either populate the hole with a
> > > shared memory entry or a copy.
> >
> > Then the code here is redundant, as the call to get_page_from_gfn(...,
> > P2M_UNSHARE) in map_vcpu_info() will already take care of populating
> > the child p2m and copy the parents page contents?
> 
> Reading the code now, yes, calling map_vcpu_info() would take care of
> populating the page for us as well so we can remove that code from
> mem_sharing.

Thanks for confirming.  Will try to prepare a patch next week to get
rid of the explicit child p2m populate for the vcpu_info page, and
hopefully simply the code here also as a result.

Roger.
Jan Beulich Oct. 16, 2023, 9:50 a.m. UTC | #7
On 29.09.2023 15:01, Tamas K Lengyel wrote:
> On Thu, Sep 28, 2023 at 9:19 AM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 28.09.2023 14:57, Tamas K Lengyel wrote:
>>> On Thu, Sep 28, 2023 at 7:08 AM Roger Pau Monné <roger.pau@citrix.com> wrote:
>>>> On Thu, Sep 28, 2023 at 12:11:02PM +0200, Jan Beulich wrote:
>>>>> On 28.09.2023 11:51, Roger Pau Monné wrote:
>>>>>> On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
>>>>>>> +        /*
>>>>>>> +         * Map the area into the guest. For simplicity specify the entire range
>>>>>>> +         * up to the end of the page: All the function uses it for is to check
>>>>>>> +         * that the range doesn't cross page boundaries. Having the area mapped
>>>>>>> +         * in the original domain implies that it fits there and therefore will
>>>>>>> +         * also fit in the clone.
>>>>>>> +         */
>>>>>>> +        offset = PAGE_OFFSET(d_area->map);
>>>>>>> +        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>>>>>>> +                             PAGE_SIZE - offset, cd_area, NULL);
>>>>>>> +        if ( ret )
>>>>>>> +            return ret;
>>>>>>> +    }
>>>>>>> +    else
>>>>>>> +        cd_mfn = page_to_mfn(cd_area->pg);
>>>>>>> +
>>>>>>> +    copy_domain_page(cd_mfn, d_mfn);
>>>>>>
>>>>>> I think the page copy should be done only once, when the page is
>>>>>> populated on the child p2m.  Otherwise areas smaller than a page size
>>>>>> (like vpcu_time_info_t) that share the same page will get multiple
>>>>>> copies of the same data for no reason.
>>>>>
>>>>> I think you're right, but this would then be another issue in the original
>>>>> code that I merely didn't spot (and it's not just "copy for no reason",
>>>>> we'd actually corrupt what was put there before). IOW the copying needs to
>>>>> move ahead of map_guest_area() (or yet more precisely after the error
>>>>> checking for p2m->set_entry()), and in the original code it would have
>>>>> needed to live ahead of map_vcpu_info(). Once again I'd like Tamas to
>>>>> confirm (or otherwise) before making that change, though.
>>>>
>>>> Yes, it's already an issue in the current code.  I wonder whether
>>>> logic in the guest or Xen could malfunctions due to the fact that
>>>> map_vcpu_info() unconditionally sets evtchn_upcall_pending and injects
>>>> an event channel upcall, but the later call to copy_domain_page()
>>>> might unset evtchn_upcall_pending while the vector is already injected.
>>>
>>> Sorry but I really don't follow the discussion here. My understanding
>>> was that map_vcpu_info, as its name suggests, maps the page. We use it
>>> to map a new page into that position in case the fork hasn't set it up
>>> yet but the parent has one. Then we follow with the copy from the
>>> parent so the page content is matching. If there is already a
>>> vcpu_info page in the fork, we just do the copy.
>>>
>>> Now, if map_vcpu_info does more then mapping, then I don't know what
>>> it does, why it does it, and what happens if we skip it when the fork
>>> is reset for example. Is the suggestion to call it map_vcpu_info every
>>> time the page content is reset (ie after the copy)?
>>
>> The vCPU info area (already prior to this series) and the two other areas
>> can be updated by the hypervisor at any time. Once one such area is
>> registered within a certain page, if another such area happens to live in
>> the same page, copying the entire page again would overwrite all updates
>> that might already have been made for the first area. IOW copying ought
>> to - imo - happen exactly once, when the new page is allocated.
>>
>> As to map_vcpu_info() - just look at the function: It writes to the newly
>> registered area. Even if the function name says just "map", that's an
>> integral part of the operation. We can't just map it, but leave the area
>> untouched.
> 
> The domains are paused (both the parent and the child) when a new fork
> is being made and also during fork reset. If Xen really does write to
> these memory areas "any time", even if the domain is paused, we will
> need a lock to tell Xen not touch it cause it hasn't finished being
> constructed/reset yet.

It's not really "any" time, but "any time the vCPU is (about to) run(ning)".

> Moreover, not sure what Xen is writing to these
> areas, but if it's anything "stateful" it should be discarded or
> copied from the parent because the guest state must match the parent
> after the fork/reset op.

How that? Both are running independently once unpaused, and the data
written is derived from various bits of system and guest state. I see
no reason why the initial populating would need to clone the parent's
when immediately afterwards the data would be overwritten by child-
specific values.

Jan
diff mbox series

Patch

--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1641,6 +1641,68 @@  static void copy_vcpu_nonreg_state(struc
     hvm_set_nonreg_state(cd_vcpu, &nrs);
 }
 
+static int copy_guest_area(struct guest_area *cd_area,
+                           const struct guest_area *d_area,
+                           struct vcpu *cd_vcpu,
+                           const struct domain *d)
+{
+    mfn_t d_mfn, cd_mfn;
+
+    if ( !d_area->pg )
+        return 0;
+
+    d_mfn = page_to_mfn(d_area->pg);
+
+    /* Allocate & map a page for the area if it hasn't been already. */
+    if ( !cd_area->pg )
+    {
+        gfn_t gfn = mfn_to_gfn(d, d_mfn);
+        struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
+        p2m_type_t p2mt;
+        p2m_access_t p2ma;
+        unsigned int offset;
+        int ret;
+
+        cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
+        if ( mfn_eq(cd_mfn, INVALID_MFN) )
+        {
+            struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
+
+            if ( !pg )
+                return -ENOMEM;
+
+            cd_mfn = page_to_mfn(pg);
+            set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
+
+            ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+                                 p2m->default_access, -1);
+            if ( ret )
+                return ret;
+        }
+        else if ( p2mt != p2m_ram_rw )
+            return -EBUSY;
+
+        /*
+         * Map the area into the guest. For simplicity specify the entire range
+         * up to the end of the page: All the function uses it for is to check
+         * that the range doesn't cross page boundaries. Having the area mapped
+         * in the original domain implies that it fits there and therefore will
+         * also fit in the clone.
+         */
+        offset = PAGE_OFFSET(d_area->map);
+        ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
+                             PAGE_SIZE - offset, cd_area, NULL);
+        if ( ret )
+            return ret;
+    }
+    else
+        cd_mfn = page_to_mfn(cd_area->pg);
+
+    copy_domain_page(cd_mfn, d_mfn);
+
+    return 0;
+}
+
 static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
 {
     struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
@@ -1733,6 +1795,16 @@  static int copy_vcpu_settings(struct dom
             copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
         }
 
+        /* Same for the (physically registered) runstate and time info areas. */
+        ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
+                              &d_vcpu->runstate_guest_area, cd_vcpu, d);
+        if ( ret )
+            return ret;
+        ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
+                              &d_vcpu->arch.time_guest_area, cd_vcpu, d);
+        if ( ret )
+            return ret;
+
         ret = copy_vpmu(d_vcpu, cd_vcpu);
         if ( ret )
             return ret;
@@ -1974,7 +2046,10 @@  int mem_sharing_fork_reset(struct domain
 
  state:
     if ( reset_state )
+    {
         rc = copy_settings(d, pd);
+        /* TBD: What to do here with -ERESTART? */
+    }
 
     domain_unpause(d);
 
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1601,6 +1601,13 @@  void unmap_vcpu_info(struct vcpu *v)
     put_page_and_type(mfn_to_page(mfn));
 }
 
+int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
+                   struct guest_area *area,
+                   void (*populate)(void *dst, struct vcpu *v))
+{
+    return -EOPNOTSUPP;
+}
+
 /*
  * This is only intended to be used for domain cleanup (or more generally only
  * with at least the respective vCPU, if it's not the current one, reliably