
[v11,2/3] x86/mem_sharing: reset a fork

Message ID b76a2a71bdbb26e57088dab8f7c3966432aed729.1582914998.git.tamas.lengyel@intel.com (mailing list archive)
State Superseded
Series VM forking

Commit Message

Tamas K Lengyel Feb. 28, 2020, 6:40 p.m. UTC
Implement a hypercall that allows a fork to shed all memory that got
allocated for it during its execution and reload its vCPU context from the
parent VM. This allows the forked VM to be reset into the same state the
parent VM is in faster than creating a new fork would be. Measurements show
about a 2x speedup during normal fuzzing operations. Performance may vary
depending on how much memory got allocated for the forked VM. If it has been
completely deduplicated from the parent VM, then creating a new fork would
likely be more performant.

Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
---
v11: use copy_settings function to reset special pages
---
 xen/arch/x86/mm/mem_sharing.c | 115 ++++++++++++++++++++++++++++++++++
 xen/include/public/memory.h   |   4 ++
 2 files changed, 119 insertions(+)
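
(Usage sketch, not part of the patch: the toolstack wrapper name below is an
assumption for illustration only; on the hypervisor side the operation is
reached via XENMEM_sharing_op with op set to XENMEM_sharing_op_fork_reset.)

    #include <xenctrl.h>

    /*
     * Sketch of one fuzzing iteration against a short-lived fork.
     * xc_memshr_fork_reset() is assumed here to be a thin toolstack
     * wrapper issuing XENMEM_sharing_op_fork_reset for the fork domain.
     */
    static int fuzz_one(xc_interface *xch, uint32_t fork_domid)
    {
        int rc;

        /* ... inject the test case and run the fork to completion ... */

        /*
         * Shed all memory the fork allocated and reload its vCPU context
         * from the parent, instead of destroying and re-creating the fork.
         */
        rc = xc_memshr_fork_reset(xch, fork_domid);

        return rc;
    }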

Comments

Jan Beulich March 18, 2020, 11:36 a.m. UTC | #1
On 28.02.2020 19:40, Tamas K Lengyel wrote:
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1775,6 +1775,91 @@ static int fork(struct domain *cd, struct domain *d)
>       return rc;
>   }
>   
> +/*
> + * The fork reset operation is intended to be used on short-lived forks only.
> + */
> +static int fork_reset(struct domain *d, struct domain *cd,

Could I talk you into using pd instead of d, to even more
clearly distinguish which of the two domains is meant? Also
in principle this might be possible to be a pointer to const,
albeit I realize this may need changes you likely don't want
to do in a prereq patch (and maybe there's actually a reason
why it can't be).

> +                      struct mem_sharing_op_fork_reset *fr)
> +{
> +    int rc = 0;
> +    struct p2m_domain* p2m = p2m_get_hostp2m(cd);

Star and blank want to switch places here.

> +    struct page_info *page, *tmp;
> +    unsigned long list_position = 0, preempt_count = 0, restart = fr->opaque;
> +
> +    domain_pause(cd);
> +
> +    page_list_for_each_safe(page, tmp, &cd->page_list)

You may not iterate a domain's page list without holding its
page-alloc lock. Even if the domain is paused, other entities
(like the controlling domain) may cause the list to be altered.
With this the question then of course becomes whether holding
that lock for this long is acceptable. I guess you need to
somehow mark the pages you've processed, either by a flag or
by moving between separate lists. Domain cleanup does something
along these lines.
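
For illustration, the pattern being referred to is roughly the below
(a sketch only; whether the lock may be held across the whole walk is
exactly the open question):

    spin_lock(&cd->page_alloc_lock);
    page_list_for_each_safe(page, tmp, &cd->page_list)
    {
        /* ... per-page processing ... */
    }
    spin_unlock(&cd->page_alloc_lock);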

> +    {
> +        p2m_type_t p2mt;
> +        p2m_access_t p2ma;
> +        gfn_t gfn;
> +        mfn_t mfn;
> +        bool shared = false;
> +
> +        list_position++;
> +
> +        /* Resume where we left off before preemption */
> +        if ( restart && list_position < restart )
> +            continue;

This assumes the list to not have been changed across a continuation,
which isn't going to fly.

> +        mfn = page_to_mfn(page);
> +        if ( mfn_valid(mfn) )

All pages on a domain's list should have a valid MFN - what are you
trying to protect against here?

> +        {
> +
> +            gfn = mfn_to_gfn(cd, mfn);

Stray blank line above here?

> +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> +                                        0, NULL, false);
> +
> +            if ( p2m_is_ram(p2mt) && !p2m_is_shared(p2mt) )
> +            {
> +                /* take an extra reference, must work for a shared page */

The comment (and also the next one further down) looks contradictory
to the if() immediately ahead, at least to me. Could you clarify the
situation, please?

> +                if( !get_page(page, cd) )
> +                {
> +                    ASSERT_UNREACHABLE();
> +                    return -EINVAL;
> +                }
> +
> +                shared = true;
> +                preempt_count += 0x10;
> +
> +                /*
> +                 * Must succeed, it's a shared page that exists and
> +                 * thus its size is guaranteed to be 4k so we are not splitting
> +                 * large pages.
> +                 */
> +                rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
> +                                    p2m_invalid, p2m_access_rwx, -1);
> +                ASSERT(!rc);
> +
> +                put_page_alloc_ref(page);
> +                put_page(page);
> +            }
> +        }
> +
> +        if ( !shared )
> +            preempt_count++;
> +
> +        /* Preempt every 2MiB (shared) or 32MiB (unshared) - arbitrary. */
> +        if ( preempt_count >= 0x2000 )
> +        {
> +            if ( hypercall_preempt_check() )
> +            {
> +                rc = -ERESTART;

You use a negative return value here, but ...

> +                break;
> +            }
> +            preempt_count = 0;
> +        }
> +    }
> +
> +    if ( rc )
> +        fr->opaque = list_position;
> +    else
> +        rc = copy_settings(cd, d);
> +
> +    domain_unpause(cd);
> +    return rc;
> +}
> +
>   int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
>   {
>       int rc;
> @@ -2066,6 +2151,36 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
>           break;
>       }
>   
> +    case XENMEM_sharing_op_fork_reset:
> +    {
> +        struct domain *pd;
> +
> +        rc = -ENOSYS;
> +        if ( !mem_sharing_is_fork(d) )
> +            goto out;
> +
> +        rc = rcu_lock_live_remote_domain_by_id(d->parent->domain_id, &pd);
> +        if ( rc )
> +            goto out;
> +
> +        rc = fork_reset(pd, d, &mso.u.fork_reset);
> +
> +        rcu_unlock_domain(pd);
> +
> +        if ( rc > 0 )

... you check for a positive value here. I didn't get around to
looking at earlier versions, so I can only guess the -ERESTART above
is what it was changed to later on.
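
(For reference, the conventional shape when -ERESTART is used, built from
the patch's own continuation code, would be roughly:)

    if ( rc == -ERESTART )
    {
        if ( __copy_to_guest(arg, &mso, 1) )
            rc = -EFAULT;
        else
            rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
                                               "lh", XENMEM_sharing_op,
                                               arg);
    }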

Jan
Tamas K Lengyel March 18, 2020, 2 p.m. UTC | #2
On Wed, Mar 18, 2020 at 5:36 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 28.02.2020 19:40, Tamas K Lengyel wrote:
> > --- a/xen/arch/x86/mm/mem_sharing.c
> > +++ b/xen/arch/x86/mm/mem_sharing.c
> > @@ -1775,6 +1775,91 @@ static int fork(struct domain *cd, struct domain *d)
> >       return rc;
> >   }
> >
> > +/*
> > + * The fork reset operation is intended to be used on short-lived forks only.
> > + */
> > +static int fork_reset(struct domain *d, struct domain *cd,
>
> Could I talk you into using pd instead of d, to even more
> clearly distinguish which of the two domains is meant? Also
> in principle this might be possible to be a pointer to const,
> albeit I realize this may need changes you likely don't want
> to do in a prereq patch (and maybe there's actually a reason
> why it can't be).

The names d and cd are used across the mem_sharing codebase; for
consistency I'm keeping that.

>
> > +                      struct mem_sharing_op_fork_reset *fr)
> > +{
> > +    int rc = 0;
> > +    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
>
> Star and blank want to switch places here.
>
> > +    struct page_info *page, *tmp;
> > +    unsigned long list_position = 0, preempt_count = 0, restart = fr->opaque;
> > +
> > +    domain_pause(cd);
> > +
> > +    page_list_for_each_safe(page, tmp, &cd->page_list)
>
> You may not iterate a domain's page list without holding its
> page-alloc lock. Even if the domain is paused, other entities
> (like the controlling domain) may cause the list to be altered.
> With this the question then of course becomes whether holding
> that lock for this long is acceptable. I guess you need to
> somehow mark the pages you've processed, either by a flag or
> by moving between separate lists. Domain cleanup does something
> along these lines.
>
> > +    {
> > +        p2m_type_t p2mt;
> > +        p2m_access_t p2ma;
> > +        gfn_t gfn;
> > +        mfn_t mfn;
> > +        bool shared = false;
> > +
> > +        list_position++;
> > +
> > +        /* Resume where we left off before preemption */
> > +        if ( restart && list_position < restart )
> > +            continue;
>
> This assumes the list to not have been changed across a continuation,
> which isn't going to fly.


OK, I'm going to drop the continuation here completely. I was reluctant
to add it to begin with, since this hypercall should only be called when
the number of pages is low, so there wouldn't be a continuation anyway.
This is work I'm unable to assign more time to; if someone in the future
really needs continuation support they are welcome to figure it out.

>
> > +        mfn = page_to_mfn(page);
> > +        if ( mfn_valid(mfn) )
>
> All pages on a domain's list should have a valid MFN - what are you
> trying to protect against here?

I saw no documentation stating what you stated above. If that's the
case it can be dropped.

>
> > +        {
> > +
> > +            gfn = mfn_to_gfn(cd, mfn);
>
> Stray blank line above here?
>
> > +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> > +                                        0, NULL, false);
> > +
> > +            if ( p2m_is_ram(p2mt) && !p2m_is_shared(p2mt) )
> > +            {
> > +                /* take an extra reference, must work for a shared page */
>
> The comment (and also the next one further down) looks contradictory
> to the if() immediately ahead, at least to me. Could you clarify the
> situation, please?

I don't understand your question. The comment explains exactly what
happens: taking an extra reference must work, and if it doesn't, we
trigger ASSERT_UNREACHABLE(). Which part is confusing?

>
> > +                if( !get_page(page, cd) )
> > +                {
> > +                    ASSERT_UNREACHABLE();
> > +                    return -EINVAL;
> > +                }
> > +
> > +                shared = true;
> > +                preempt_count += 0x10;
> > +
> > +                /*
> > +                 * Must succeed, it's a shared page that exists and
> > +                 * thus its size is guaranteed to be 4k so we are not splitting
> > +                 * large pages.
> > +                 */
> > +                rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
> > +                                    p2m_invalid, p2m_access_rwx, -1);
> > +                ASSERT(!rc);
> > +
> > +                put_page_alloc_ref(page);
> > +                put_page(page);
> > +            }
> > +        }
> > +
> > +        if ( !shared )
> > +            preempt_count++;
> > +
> > +        /* Preempt every 2MiB (shared) or 32MiB (unshared) - arbitrary. */
> > +        if ( preempt_count >= 0x2000 )
> > +        {
> > +            if ( hypercall_preempt_check() )
> > +            {
> > +                rc = -ERESTART;
>
> You use a negative return value here, but ...
>
> > +                break;
> > +            }
> > +            preempt_count = 0;
> > +        }
> > +    }
> > +
> > +    if ( rc )
> > +        fr->opaque = list_position;
> > +    else
> > +        rc = copy_settings(cd, d);
> > +
> > +    domain_unpause(cd);
> > +    return rc;
> > +}
> > +
> >   int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
> >   {
> >       int rc;
> > @@ -2066,6 +2151,36 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
> >           break;
> >       }
> >
> > +    case XENMEM_sharing_op_fork_reset:
> > +    {
> > +        struct domain *pd;
> > +
> > +        rc = -ENOSYS;
> > +        if ( !mem_sharing_is_fork(d) )
> > +            goto out;
> > +
> > +        rc = rcu_lock_live_remote_domain_by_id(d->parent->domain_id, &pd);
> > +        if ( rc )
> > +            goto out;
> > +
> > +        rc = fork_reset(pd, d, &mso.u.fork_reset);
> > +
> > +        rcu_unlock_domain(pd);
> > +
> > +        if ( rc > 0 )
>
> ... you check for a positive value here. I didn't get around to
> looking at earlier versions, so I can only guess the -ERESTART above
> is what it was changed to later on.

I'm dropping continuation for the next revision so this no longer matters.

Thanks,
Tamas
Jan Beulich March 18, 2020, 2:12 p.m. UTC | #3
On 18.03.2020 15:00, Tamas K Lengyel wrote:
> On Wed, Mar 18, 2020 at 5:36 AM Jan Beulich <jbeulich@suse.com> wrote:
>> On 28.02.2020 19:40, Tamas K Lengyel wrote:
>>> +        mfn = page_to_mfn(page);
>>> +        if ( mfn_valid(mfn) )
>>
>> All pages on a domain's list should have a valid MFN - what are you
>> trying to protect against here?
> 
> I saw no documentation stating what you stated above. If that's the
> case it can be dropped.

Only pages coming from the allocator (or, in some special cases,
otherwise valid) get put on a domain's page list. By coming from
the allocator, their MFNs are implicitly valid.
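
(In other words, the mfn_valid() check could simply be dropped, or at
most become an assertion; sketch:)

    mfn = page_to_mfn(page);
    /* Pages on a domain's page list come from the allocator. */
    ASSERT(mfn_valid(mfn));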

>>> +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
>>> +                                        0, NULL, false);
>>> +
>>> +            if ( p2m_is_ram(p2mt) && !p2m_is_shared(p2mt) )
>>> +            {
>>> +                /* take an extra reference, must work for a shared page */
>>
>> The comment (and also the next one further down) looks contradictory
>> to the if() immediately ahead, at least to me. Could you clarify the
>> situation, please?
> 
> I don't understand your question. The comment explains exactly what
> happens: taking an extra reference must work, and if it doesn't, we
> trigger ASSERT_UNREACHABLE(). Which part is confusing?

The comment says "a shared page" whereas the condition includes
"!p2m_is_shared(p2mt)", which I understand to mean a page which is
not shared.

As to you dropping continuations again - please have at least a
bold comment clarifying that their addition is a requirement for
the code to ever reach "supported" status. (Any other obvious but
intentional omissions could also be named there.)
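
Something along the lines of the below would do (wording only a sketch):

    /*
     * TODO: This operation has no hypercall continuation support and is
     * only meant for short-lived forks with few allocated pages. Adding
     * continuations is a prerequisite for this ever becoming "supported".
     */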

Jan
Tamas K Lengyel March 18, 2020, 3:13 p.m. UTC | #4
On Wed, Mar 18, 2020 at 8:13 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 18.03.2020 15:00, Tamas K Lengyel wrote:
> > On Wed, Mar 18, 2020 at 5:36 AM Jan Beulich <jbeulich@suse.com> wrote:
> >> On 28.02.2020 19:40, Tamas K Lengyel wrote:
> >>> +        mfn = page_to_mfn(page);
> >>> +        if ( mfn_valid(mfn) )
> >>
> >> All pages on a domain's list should have a valid MFN - what are you
> >> trying to protect against here?
> >
> > I saw no documentation stating what you stated above. If that's the
> > case it can be dropped.
>
> Only pages coming from the allocator (or, in some special cases,
> otherwise valid) get put on a domain's page list. By coming from
> the allocator, their MFNs are implicitly valid.
>
> >>> +            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
> >>> +                                        0, NULL, false);
> >>> +
> >>> +            if ( p2m_is_ram(p2mt) && !p2m_is_shared(p2mt) )
> >>> +            {
> >>> +                /* take an extra reference, must work for a shared page */
> >>
> >> The comment (and also the next one further down) looks contradictory
> >> to the if() immediately ahead, at least to me. Could you clarify the
> >> situation, please?
> >
> > I don't understand your question. The comment explains exactly what
> > happens: taking an extra reference must work, and if it doesn't, we
> > trigger ASSERT_UNREACHABLE(). Which part is confusing?
>
> The comment says "a shared page" whereas the condition includes
> "!p2m_is_shared(p2mt)", which I understand to mean a page which is
> not shared.
>
> As to you dropping continuations again - please have at least a
> bold comment clarifying that their addition is a requirement for
> the code to ever reach "supported" status. (Any other obvious but
> intentional omissions could also be named there.)
>

Sure, I had that comment in place before. There are no plans to have
this code be "supported"; we are fine with it being experimental.

Tamas

Patch

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 4a840d09dd..a458096b01 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1775,6 +1775,91 @@  static int fork(struct domain *cd, struct domain *d)
     return rc;
 }
 
+/*
+ * The fork reset operation is intended to be used on short-lived forks only.
+ */
+static int fork_reset(struct domain *d, struct domain *cd,
+                      struct mem_sharing_op_fork_reset *fr)
+{
+    int rc = 0;
+    struct p2m_domain* p2m = p2m_get_hostp2m(cd);
+    struct page_info *page, *tmp;
+    unsigned long list_position = 0, preempt_count = 0, restart = fr->opaque;
+
+    domain_pause(cd);
+
+    page_list_for_each_safe(page, tmp, &cd->page_list)
+    {
+        p2m_type_t p2mt;
+        p2m_access_t p2ma;
+        gfn_t gfn;
+        mfn_t mfn;
+        bool shared = false;
+
+        list_position++;
+
+        /* Resume where we left off before preemption */
+        if ( restart && list_position < restart )
+            continue;
+
+        mfn = page_to_mfn(page);
+        if ( mfn_valid(mfn) )
+        {
+
+            gfn = mfn_to_gfn(cd, mfn);
+            mfn = __get_gfn_type_access(p2m, gfn_x(gfn), &p2mt, &p2ma,
+                                        0, NULL, false);
+
+            if ( p2m_is_ram(p2mt) && !p2m_is_shared(p2mt) )
+            {
+                /* take an extra reference, must work for a shared page */
+                if( !get_page(page, cd) )
+                {
+                    ASSERT_UNREACHABLE();
+                    return -EINVAL;
+                }
+
+                shared = true;
+                preempt_count += 0x10;
+
+                /*
+                 * Must succeed, it's a shared page that exists and
+                 * thus its size is guaranteed to be 4k so we are not splitting
+                 * large pages.
+                 */
+                rc = p2m->set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_4K,
+                                    p2m_invalid, p2m_access_rwx, -1);
+                ASSERT(!rc);
+
+                put_page_alloc_ref(page);
+                put_page(page);
+            }
+        }
+
+        if ( !shared )
+            preempt_count++;
+
+        /* Preempt every 2MiB (shared) or 32MiB (unshared) - arbitrary. */
+        if ( preempt_count >= 0x2000 )
+        {
+            if ( hypercall_preempt_check() )
+            {
+                rc = -ERESTART;
+                break;
+            }
+            preempt_count = 0;
+        }
+    }
+
+    if ( rc )
+        fr->opaque = list_position;
+    else
+        rc = copy_settings(cd, d);
+
+    domain_unpause(cd);
+    return rc;
+}
+
 int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
 {
     int rc;
@@ -2066,6 +2151,36 @@  int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
         break;
     }
 
+    case XENMEM_sharing_op_fork_reset:
+    {
+        struct domain *pd;
+
+        rc = -ENOSYS;
+        if ( !mem_sharing_is_fork(d) )
+            goto out;
+
+        rc = rcu_lock_live_remote_domain_by_id(d->parent->domain_id, &pd);
+        if ( rc )
+            goto out;
+
+        rc = fork_reset(pd, d, &mso.u.fork_reset);
+
+        rcu_unlock_domain(pd);
+
+        if ( rc > 0 )
+        {
+            if ( __copy_to_guest(arg, &mso, 1) )
+                rc = -EFAULT;
+            else
+                rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
+                                                   "lh", XENMEM_sharing_op,
+                                                   arg);
+        }
+        else
+            mso.u.fork_reset.opaque = 0;
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index c1dbad060e..7ca07c01dd 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -483,6 +483,7 @@  DEFINE_XEN_GUEST_HANDLE(xen_mem_access_op_t);
 #define XENMEM_sharing_op_audit             7
 #define XENMEM_sharing_op_range_share       8
 #define XENMEM_sharing_op_fork              9
+#define XENMEM_sharing_op_fork_reset        10
 
 #define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
 #define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
@@ -537,6 +538,9 @@  struct xen_mem_sharing_op {
             domid_t parent_domain;        /* IN: parent's domain id */
             uint16_t _pad[3];             /* Must be set to 0 */
         } fork;
+        struct mem_sharing_op_fork_reset {   /* OP_FORK_RESET */
+            uint64_aligned_t opaque;         /* Must be set to 0 */
+        } fork_reset;
     } u;
 };
 typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;