
[1/5] xen/common: introduce a new framework for save/restore of 'domain' context

Message ID 20200327185012.1795-2-paul@xen.org
State Superseded
Series domain context infrastructure

Commit Message

Paul Durrant March 27, 2020, 6:50 p.m. UTC
Domain context is state held in the hypervisor that does not come under
the category of 'HVM state' but is instead 'PV state' that is common
between PV guests and enlightened HVM guests (i.e. those that have PV
drivers) such as event channel state, grant entry state, etc.

To allow enlightened HVM guests to be migrated without their co-operation
it will be necessary to transfer such state along with the domain's
memory image, architectural state, etc. This framework is introduced for
that purpose.

This patch adds the new public header and the low level implementation,
entered via the domain_save() or domain_load() functions. Subsequent
patches will introduce other parts of the framwork, and code that will
make use of it within the current version of the libxc migration stream.

Signed-off-by: Paul Durrant <paul@xen.org>
---
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <george.dunlap@citrix.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Julien Grall <julien@xen.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Wei Liu <wl@xen.org>
---
 xen/common/Makefile       |   1 +
 xen/common/save.c         | 262 ++++++++++++++++++++++++++++++++++++++
 xen/include/public/save.h |  74 +++++++++++
 xen/include/xen/save.h    | 115 +++++++++++++++++
 4 files changed, 452 insertions(+)
 create mode 100644 xen/common/save.c
 create mode 100644 xen/include/public/save.h
 create mode 100644 xen/include/xen/save.h

Comments

Julien Grall April 1, 2020, noon UTC | #1
Hi Paul,

On 27/03/2020 18:50, Paul Durrant wrote:
> Domain context is state held in the hypervisor that does not come under
> the category of 'HVM state' but is instead 'PV state' that is common
> between PV guests and enlightened HVM guests (i.e. those that have PV
> drivers) such as event channel state, grant entry state, etc.
> 
> To allow enlightened HVM guests to be migrated without their co-operation
> it will be necessary to transfer such state along with the domain's
> memory image, architectural state, etc. This framework is introduced for
> that purpose.
> 
> This patch adds the new public header and the low level implementation,
> entered via the domain_save() or domain_load() functions. Subsequent
> patches will introduce other parts of the framwork, and code that will

Typo: s/framwork/framework/

> make use of it within the current version of the libxc migration stream.
> 
> Signed-off-by: Paul Durrant <paul@xen.org>
> ---
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: George Dunlap <george.dunlap@citrix.com>
> Cc: Ian Jackson <ian.jackson@eu.citrix.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Julien Grall <julien@xen.org>
> Cc: Stefano Stabellini <sstabellini@kernel.org>
> Cc: Wei Liu <wl@xen.org>
> ---
>   xen/common/Makefile       |   1 +
>   xen/common/save.c         | 262 ++++++++++++++++++++++++++++++++++++++
>   xen/include/public/save.h |  74 +++++++++++
>   xen/include/xen/save.h    | 115 +++++++++++++++++
>   4 files changed, 452 insertions(+)
>   create mode 100644 xen/common/save.c
>   create mode 100644 xen/include/public/save.h
>   create mode 100644 xen/include/xen/save.h
> 
> diff --git a/xen/common/Makefile b/xen/common/Makefile
> index e8cde65370..90553ba5d7 100644
> --- a/xen/common/Makefile
> +++ b/xen/common/Makefile
> @@ -37,6 +37,7 @@ obj-y += radix-tree.o
>   obj-y += rbtree.o
>   obj-y += rcupdate.o
>   obj-y += rwlock.o
> +obj-y += save.o
>   obj-y += shutdown.o
>   obj-y += softirq.o
>   obj-y += sort.o
> diff --git a/xen/common/save.c b/xen/common/save.c
> new file mode 100644
> index 0000000000..bef99452d8
> --- /dev/null
> +++ b/xen/common/save.c
> @@ -0,0 +1,262 @@
> +/*
> + * save.c: Save and restore PV guest state common to all domain types.
> + *
> + * Copyright Amazon.com Inc. or its affiliates.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <xen/save.h>
> +
> +struct domain_context {
> +    bool log;
> +    struct domain_save_descriptor desc;
> +    domain_copy_entry copy;

As your new framework is technically an extension of the existing one, it 
would be good to explain why we diverge in the definitions.

> +    void *priv;
> +};
> +
> +static struct {
> +    const char *name;
> +    bool per_vcpu;
> +    domain_save_handler save;
> +    domain_load_handler load;
> +} handlers[DOMAIN_SAVE_CODE_MAX + 1];
> +
> +void __init domain_register_save_type(unsigned int tc, const char *name,
> +                                      bool per_vcpu,
> +                                      domain_save_handler save,
> +                                      domain_load_handler load)
> +{
> +    BUG_ON(tc > ARRAY_SIZE(handlers));
> +
> +    ASSERT(!handlers[tc].save);
> +    ASSERT(!handlers[tc].load);
> +
> +    handlers[tc].name = name;
> +    handlers[tc].per_vcpu = per_vcpu;
> +    handlers[tc].save = save;
> +    handlers[tc].load = load;
> +}
> +
> +int domain_save_entry(struct domain_context *c, unsigned int tc,
> +                      const char *name, const struct vcpu *v, void *src,

I realize that 'src' is not const because of how you define 'copy'; however, 
I would prefer to rework the interface so that we can keep the const in the 
definition. This may mean having to define two callbacks rather than one.

> +                      size_t src_len)

On a 64-bit architecture, size_t is 64-bit, but the record only stores a 
32-bit length. Would it make sense to add a sanity check in the code?
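
Something along these lines (untested) would do:

    if ( src_len != (uint32_t)src_len ) /* must fit the 32-bit length field */
        return -E2BIG;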

> +{
> +    int rc;
> +
> +    if ( c->log && tc != DOMAIN_SAVE_CODE(HEADER) &&
> +         tc != DOMAIN_SAVE_CODE(END) )
> +        gdprintk(XENLOG_INFO, "%pv save: %s (%lu)\n", v, name, src_len);
> +
> +    if ( !IS_ALIGNED(src_len, 8) )

Why not add implicit padding? This would avoid having to chase errors during 
saving because the length wasn't a multiple of 8.
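
E.g. (an untested sketch) domain_save_entry() could pad on the caller's 
behalf rather than failing:

    c->desc.length = ROUNDUP(src_len, 8);

    rc = c->copy(c->priv, &c->desc, sizeof(c->desc));
    if ( rc )
        return rc;

    rc = c->copy(c->priv, src, src_len);
    if ( rc )
        return rc;

    if ( c->desc.length != src_len ) /* zero-fill up to the padded length */
    {
        uint8_t pad[8] = {};

        rc = c->copy(c->priv, pad, c->desc.length - src_len);
    }

    return rc;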

> +        return -EINVAL;
> +
> +    BUG_ON(tc != c->desc.typecode);
> +    BUG_ON(v->vcpu_id != c->desc.instance);

Does the descriptor really need to be filled by domain_save()? Can't it 
be done here, so we can avoid the two BUG_ON()s?

> +    c->desc.length = src_len;
> +
> +    rc = c->copy(c->priv, &c->desc, sizeof(c->desc));
> +    if ( rc )
> +        return rc;
> +
> +    return c->copy(c->priv, src, src_len);
> +}
> +
> +int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
> +                unsigned long mask, bool dry_run)
> +{
> +    struct domain_context c = {
> +        .copy = copy,
> +        .priv = priv,
> +        .log = !dry_run,
> +    };
> +    struct domain_save_header h = {
> +        .magic = DOMAIN_SAVE_MAGIC,
> +        .version = DOMAIN_SAVE_VERSION,
> +    };
> +    struct domain_save_header e;
> +    unsigned int i;
> +    int rc;
> +
> +    ASSERT(d != current->domain);
> +
> +    if ( d->is_dying )
> +        return -EINVAL;
> +
> +    domain_pause(d);
> +
> +    c.desc.typecode = DOMAIN_SAVE_CODE(HEADER);
> +
> +    rc = DOMAIN_SAVE_ENTRY(HEADER, &c, d->vcpu[0], &h, sizeof(h));
> +    if ( rc )
> +        goto out;
> +
> +    for ( i = 0; i < ARRAY_SIZE(handlers); i++ )

AFAIU, with this solution, if there are dependencies between records, the 
dependencies will have to have a lower "index". I think we may have some 
dependencies with guest transparent migration; for instance, we need to 
restore the event ABI before restoring the event channels.

Should we rely on the index for the dependency?

In any case, I think we want to document it.

> +    {
> +        domain_save_handler save = handlers[i].save;
> +
> +        if ( (mask && !test_bit(i, &mask)) || !save )

The type of mask suggests it is not possible to have more than 32 different 
types of record if we want to be arch-agnostic. Even if we focus on 64-bit 
architectures, that is only 64 records.

This is not very future-proof, but might be OK if this is not exposed 
outside of the hypervisor (I haven't looked at the rest of the series 
yet). However, we at least need a BUILD_BUG_ON() in place. So please 
make sure DOMAIN_SAVE_CODE_MAX is always less than 64.

Also, what if a bit is set in the mask but the corresponding record does not 
exist? Shouldn't we return an error?
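
E.g. (sketch):

    BUILD_BUG_ON(DOMAIN_SAVE_CODE_MAX >= BITS_PER_LONG);

would at least catch an overflowing typecode at build time.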

> +            continue;
> +
> +        memset(&c.desc, 0, sizeof(c.desc));
> +        c.desc.typecode = i;
> +
> +        if ( handlers[i].per_vcpu )
> +        {
> +            struct vcpu *v;
> +
> +            for_each_vcpu ( d, v )
> +            {
> +                c.desc.instance = v->vcpu_id;
> +
> +                rc = save(v, &c, dry_run);
> +                if ( rc )
> +                    goto out;
> +            }
> +        }
> +        else
> +        {
> +            rc = save(d->vcpu[0], &c, dry_run);
> +            if ( rc )
> +                goto out;
> +        }
> +    }
> +
> +    memset(&c.desc, 0, sizeof(c.desc));
> +    c.desc.typecode = DOMAIN_SAVE_CODE(END);
> +
> +    rc = DOMAIN_SAVE_ENTRY(END, &c, d->vcpu[0], &e, 0);
> +
> + out:
> +    domain_unpause(d);
> +
> +    return rc;
> +}
> +
> +int domain_load_entry(struct domain_context *c, unsigned int tc,
> +                      const char *name, const struct vcpu *v, void *dst,
> +                      size_t dst_len, bool exact)
> +{
> +    int rc;
> +
> +    if ( c->log && tc != DOMAIN_SAVE_CODE(HEADER) &&
> +         tc != DOMAIN_SAVE_CODE(END) )
> +        gdprintk(XENLOG_INFO, "%pv load: %s (%lu)\n", v, name, dst_len);
> +
> +    BUG_ON(tc != c->desc.typecode);
> +    BUG_ON(v->vcpu_id != c->desc.instance);

Is it really warranted to crash the host? What would happen if the values 
were wrong?

> +
> +    if ( (exact ?
> +          (dst_len != c->desc.length) : (dst_len < c->desc.length)) ||

Using a ternary in an if is really confusing. How about:

dst_len < c->desc.length || (exact && dst_len != c->desc.length) ||

I understand that there would be two checks for the exact case, but I 
think it is better than a ternary.

However, what is the purpose of the 'exact' parameter? If you set it to 
false, how do you know the size you read?

> +         !IS_ALIGNED(c->desc.length, 8) )
> +        return -EINVAL;
> +
> +    rc = c->copy(c->priv, dst, c->desc.length);
> +    if ( rc )
> +        return rc;
> +
> +    if ( !exact )
> +    {
> +        dst += c->desc.length;
> +        memset(dst, 0, dst_len - c->desc.length);
> +    }
> +
> +    return 0;
> +}
> +
> +int domain_load(struct domain *d,  domain_copy_entry copy, void *priv,
> +                unsigned long mask)
> +{
> +    struct domain_context c = {
> +        .copy = copy,
> +        .priv = priv,
> +        .log = true,
> +    };
> +    struct domain_save_header h, e;
> +    int rc;
> +
> +    ASSERT(d != current->domain);
> +
> +    if ( d->is_dying )
> +        return -EINVAL;

What protects you against the domain dying right after this check?

> +
> +    rc = c.copy(c.priv, &c.desc, sizeof(c.desc));
> +    if ( rc )
> +        return rc;
> +
> +    if ( c.desc.typecode != DOMAIN_SAVE_CODE(HEADER) ||
> +         c.desc.instance != 0 )
> +        return -EINVAL;
> +
> +    rc = DOMAIN_LOAD_ENTRY(HEADER, &c, d->vcpu[0], &h, sizeof(h), true);
> +    if ( rc )
> +        return rc;
> +
> +    if ( h.magic != DOMAIN_SAVE_MAGIC || h.version != DOMAIN_SAVE_VERSION )
> +        return -EINVAL;
> +
> +    domain_pause(d);
> +
> +    for (;;)
> +    {
> +        unsigned int i;
> +        domain_load_handler load;
> +        struct vcpu *v;
> +
> +        rc = c.copy(c.priv, &c.desc, sizeof(c.desc));
> +        if ( rc )
> +            break;
> +
> +        if ( c.desc.typecode == DOMAIN_SAVE_CODE(END) ) {
> +            rc = DOMAIN_LOAD_ENTRY(END, &c, d->vcpu[0], &e, 0, true);
> +            break;
> +        }
> +
> +        rc = -EINVAL;
> +        if ( c.desc.typecode >= ARRAY_SIZE(handlers) ||
> +             c.desc.instance >= d->max_vcpus )

Nothing in the documentation suggests that c.desc.instance should be 0 
when the record is for the domain.

> +            break;
> +
> +        i = c.desc.typecode;
> +        load = handlers[i].load;
> +        v = d->vcpu[c.desc.instance];
> +
> +        if ( mask && !test_bit(i, &mask) )
> +        {
> +            /* Sink the data */
> +            rc = c.copy(c.priv, NULL, c.desc.length);
> +            if ( rc )
> +                break;
> +
> +            continue;
> +        }
> +
> +        rc = load ? load(v, &c) : -EOPNOTSUPP;
> +        if ( rc )
> +            break;
> +    }
> +
> +    domain_unpause(d);
> +
> +    return rc;
> +}
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff --git a/xen/include/public/save.h b/xen/include/public/save.h
> new file mode 100644
> index 0000000000..84981cd0f6
> --- /dev/null
> +++ b/xen/include/public/save.h
> @@ -0,0 +1,74 @@
> +/*
> + * save.h
> + *
> + * Structure definitions for common PV/HVM domain state that is held by
> + * Xen and must be saved along with the domain's memory.
> + *
> + * Copyright Amazon.com Inc. or its affiliates.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to
> + * deal in the Software without restriction, including without limitation the
> + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
> + * sell copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +#ifndef __XEN_PUBLIC_SAVE_H__
> +#define __XEN_PUBLIC_SAVE_H__
> +
> +#include "xen.h"
> +
> +/* Each entry is preceded by a descriptor */
> +struct domain_save_descriptor {
> +    uint16_t typecode;
> +    uint16_t instance;
> +    /*
> +     * Entry length not including this descriptor. Entries must be padded
> +     * to a multiple of 8 bytes to make sure descriptors remain correctly
> +     * aligned.
> +     */
> +    uint32_t length;
> +};
> +
> +/*
> + * Each entry has a type associated with it. DECLARE_DOMAIN_SAVE_TYPE
> + * binds these things together.
> + */
> +#define DECLARE_DOMAIN_SAVE_TYPE(_x, _code, _type) \
> +    struct __DOMAIN_SAVE_TYPE_##_x { _type t; char c[_code]; };
> +
> +#define DOMAIN_SAVE_TYPE(_x) \
> +    typeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->t)
> +#define DOMAIN_SAVE_CODE(_x) \
> +    (sizeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->c))
> +#define DOMAIN_SAVE_MASK(_x) (1u << DOMAIN_SAVE_CODE(_x))
> +
> +/* Terminating entry */
> +struct domain_save_end {};
> +DECLARE_DOMAIN_SAVE_TYPE(END, 0, struct domain_save_end);
> +
> +#define DOMAIN_SAVE_MAGIC   0x53415645
> +#define DOMAIN_SAVE_VERSION 0x00000001
> +
> +/* Initial entry */
> +struct domain_save_header {
> +    uint32_t magic;             /* Must be DOMAIN_SAVE_MAGIC */
> +    uint32_t version;           /* Save format version */

Would it make sense to have the version of Xen in the stream?
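
E.g. (a hypothetical layout; the extra fields are my invention):

    struct domain_save_header {
        uint32_t magic;                /* Must be DOMAIN_SAVE_MAGIC */
        uint32_t version;              /* Save format version */
        uint16_t xen_major, xen_minor; /* Xen version that produced the save */
        uint32_t _pad;                 /* keep the record a multiple of 8 */
    };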

> +};
> +DECLARE_DOMAIN_SAVE_TYPE(HEADER, 1, struct domain_save_header);
> +
> +#define DOMAIN_SAVE_CODE_MAX 1
> +
> +#endif /* __XEN_PUBLIC_SAVE_H__ */
> diff --git a/xen/include/xen/save.h b/xen/include/xen/save.h
> new file mode 100644
> index 0000000000..d5846f9e68
> --- /dev/null
> +++ b/xen/include/xen/save.h
> @@ -0,0 +1,115 @@
> +/*
> + * save.h: support routines for save/restore
> + *
> + * Copyright Amazon.com Inc. or its affiliates.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef __XEN_SAVE_H__
> +#define __XEN_SAVE_H__
> +
> +#include <xen/sched.h>
> +#include <xen/types.h>
> +#include <xen/init.h>
> +
> +#include <public/xen.h>
> +#include <public/save.h>
> +
> +struct domain_context;
> +
> +int domain_save_entry(struct domain_context *c, unsigned int tc,
> +                      const char *name, const struct vcpu *v, void *src,
> +                      size_t src_len);
> +
> +#define DOMAIN_SAVE_ENTRY(_x, _c, _v, _src, _len)                        \
> +        domain_save_entry((_c), DOMAIN_SAVE_CODE(_x), #_x, (_v), (_src), \
> +                          (_len));
> +
> +int domain_load_entry(struct domain_context *c, unsigned int tc,
> +                      const char *name, const struct vcpu *v, void *dest,
> +                      size_t dest_len, bool exact);
> +
> +#define DOMAIN_LOAD_ENTRY(_x, _c, _v, _src, _len, _exact)                \
> +        domain_load_entry((_c), DOMAIN_SAVE_CODE(_x), #_x, (_v), (_src), \
> +                          (_len), (_exact));
> +
> +/*
> + * The 'dry_run' flag indicates that the caller of domain_save() (see
> + * below) is not trying to actually acquire the data, only the size
> + * of the data. The save handler can therefore limit work to only that
> + * which is necessary to call DOMAIN_SAVE_ENTRY() with an accurate value
> + * for '_len'.
> + */
> +typedef int (*domain_save_handler)(const struct vcpu *v,
> +                                   struct domain_context *h,
> +                                   bool dry_run);
> +typedef int (*domain_load_handler)(struct vcpu *v,
> +                                   struct domain_context *h);
> +
> +void domain_register_save_type(unsigned int tc, const char *name,
> +                               bool per_vcpu,
> +                               domain_save_handler save,
> +                               domain_load_handler load);
> +
> +#define DOMAIN_REGISTER_SAVE_RESTORE(_x, _per_vcpu, _save, _load) \
> +static int __init __domain_register_##_x##_save_restore(void)     \
> +{                                                                 \
> +    domain_register_save_type(                                    \
> +        DOMAIN_SAVE_CODE(_x),                                     \
> +        #_x,                                                      \
> +        (_per_vcpu),                                              \
> +        &(_save),                                                 \
> +        &(_load));                                                \
> +                                                                  \
> +    return 0;                                                     \
> +}                                                                 \
> +__initcall(__domain_register_##_x##_save_restore);
> +
> +/* Copy callback function */
> +typedef int (*domain_copy_entry)(void *priv, void *data, size_t len);
> +
> +/*
> + * Entry points:
> + *
> + * int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
> + *                 unsigned long mask, bool dry_run);
> + * int domain_load(struct domain *d, domain_copy_entry copy, void *priv,
> + *                 unsigned long mask);
> + *
> + * copy:    This is a callback function provided by the caller that will be
> + *          used to write to (in the save case) or read from (in the load
> + *          case) the context buffer.
> + * priv:    This is a pointer that will be passed to the copy function to
> + *          allow it to identify the context buffer and the current state
> + *          of the save or load operation.
> + * mask:    This is a mask to determine which save record types should be
> + *          copied to or from the buffer.
> + *          If it is zero then all save record types will be copied.
> + *          If it is non-zero then only record types with codes
> + *          corresponding to set bits will be copied. I.e. to copy save
> + *          record 'type', set the bit in position DOMAIN_SAVE_CODE(type).
> + *          DOMAIN_SAVE_CODE(HEADER) and DOMAIN_SAVE_CODE(END) records must
> + *          always be present and thus will be copied regardless of whether
> + *          the bits in those positions are set or not.
> + * dry_run: See the comment concerning (*domain_save) above.
> + *
> + * NOTE: A convenience macro, DOMAIN_SAVE_MASK(type), is defined to support
> + *       setting bits in the mask field.
> + */
> +int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
> +                unsigned long mask, bool dry_run);
> +int domain_load(struct domain *d, domain_copy_entry copy, void *priv,
> +                unsigned long mask);
> +
> +#endif /* __XEN_SAVE_H__ */
> 

Cheers,
Jan Beulich April 1, 2020, 12:07 p.m. UTC | #2
On 01.04.2020 14:00, Julien Grall wrote:
> On 27/03/2020 18:50, Paul Durrant wrote:
>> +    if ( (exact ?
>> +          (dst_len != c->desc.length) : (dst_len < c->desc.length)) ||
> 
> Using a ternary in an if is really confusing. How about:
> 
> dst_len < c->desc.length || (exact && dst_len != c->desc.length) ||
> 
> I understand that there would be two checks for the exact case, but I think it is better than a ternary.

I'm of the opposite opinion, and hence with Paul. While the alternative
you suggest is still reasonable because of the special case here, I
find it confusing / more difficult to read / follow

    if ( (a && b) || (!a && c) )

(and I've seen quite a few instances of such over time) instead of

    if ( a ? b : c )

Jan
Julien Grall April 1, 2020, 12:16 p.m. UTC | #3
Hi Jan,

On 01/04/2020 13:07, Jan Beulich wrote:
> On 01.04.2020 14:00, Julien Grall wrote:
>> On 27/03/2020 18:50, Paul Durrant wrote:
>>> +    if ( (exact ?
>>> +          (dst_len != c->desc.length) : (dst_len < c->desc.length)) ||
>>
>> Using a ternary in an if is really confusing. How about:
>>
>> dst_len < c->desc.length || (exact && dst_len != c->desc.length) ||
>>
>> I understand that there would be two checks for the exact case, but I think it is better than a ternary.
> 
> I'm of the opposite opinion, and hence with Paul. While the alternative
> you suggest is still reasonable because of the special case here, I
> find it confusing / more difficult to read / follow
> 
>      if ( (a && b) || (!a && c) )
> 
> (and I've seen quite a few instances of such over time) instead of
> 
>      if ( a ? b : c )

If the ternary were the only condition, on a single line, then it would 
be okay. However, the if is split over 3 lines...

The more stuff you put in an if, the more chance you are going to misread 
it or make a mistake (you likely know what I am referring to here ;)).

So if you prefer the ternary, then we should at least write 2 ifs.
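
E.g. something like:

    if ( !IS_ALIGNED(c->desc.length, 8) )
        return -EINVAL;

    if ( exact ? dst_len != c->desc.length : dst_len < c->desc.length )
        return -EINVAL;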

Cheers,
Jan Beulich April 1, 2020, 12:23 p.m. UTC | #4
On 01.04.2020 14:16, Julien Grall wrote:
> Hi Jan,
> 
> On 01/04/2020 13:07, Jan Beulich wrote:
>> On 01.04.2020 14:00, Julien Grall wrote:
>>> On 27/03/2020 18:50, Paul Durrant wrote:
>>>> +    if ( (exact ?
>>>> +          (dst_len != c->desc.length) : (dst_len < c->desc.length)) ||
>>>
>>> Using a ternary in an if is really confusing. How about:
>>>
>>> dst_len < c->desc.length || (exact && dst_len != c->desc.length) ||
>>>
>>> I understand that there would be two checks for the exact case, but I think it is better than a ternary.
>>
>> I'm of the opposite opinion, and hence with Paul. While the alternative
>> you suggest is still reasonable because of the special case here, I
>> find it confusing / more difficult to read / follow
>>
>>      if ( (a && b) || (!a && c) )
>>
>> (and I've seen quite a few instances of such over time) instead of
>>
>>      if ( a ? b : c )
> 
> If the ternary were the only condition, on a single line, then it would be okay. However, the if is split over 3 lines...
> 
> The more stuff you put in an if, the more chance you are going to misread it or make a mistake (you likely know what I am referring to here ;)).
> 
> So if you prefer the ternary, then we should at least write 2 ifs.

Since it's ||, that would be fine (albeit not preferred) by me. If
it were &&, would you be suggesting two nested if()s (which we
generally ask to be avoided in reviews)? I see nothing wrong
with e.g.

      if ( (a ? b : c) &&
           (x ? y : z) )

Nevertheless I agree that very large conditionals often are more
difficult to read.

Jan
Jan Beulich April 1, 2020, 2:50 p.m. UTC | #5
On 27.03.2020 19:50, Paul Durrant wrote:
> Domain context is state held in the hypervisor that does not come under
> the category of 'HVM state' but is instead 'PV state' that is common
> between PV guests and enlightened HVM guests (i.e. those that have PV
> drivers) such as event channel state, grant entry state, etc.

Without looking at the patch details yet, I'm having some difficulty
understanding how this is going to work in a safe/correct manner. I
suppose for LU the system is in a frozen enough state that
snapshotting and copying state like this is okay, but ...

> To allow enlightened HVM guests to be migrated without their co-operation
> it will be necessary to transfer such state along with the domain's
> memory image, architectural state, etc. This framework is introduced for
> that purpose.
> 
> This patch adds the new public header and the low level implementation,
> entered via the domain_save() or domain_load() functions. Subsequent
> patches will introduce other parts of the framwork, and code that will
> make use of it within the current version of the libxc migration stream.

... here you suggest (and patch 5 appears to match this) that this
is going to be used even in "normal" migration streams. All of the
items named are communication vehicles, and hence there are always
two sides that can influence the state. For event channels, the
other side (which isn't paused) or the hardware (for passed through
devices) might signal them, or it (just the former obviously) could
close their end, resulting in a state change also for the domain
being migrated. If this happens after the snapshot was taken, the
state change is lost.

Otoh I'm sure the case was considered, so perhaps I'm simply missing
some crucial aspect (which then could do with spelling out in the
description of the cover letter).

Jan
Paul Durrant April 2, 2020, 9:58 a.m. UTC | #6
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 01 April 2020 15:51
> To: Paul Durrant <paul@xen.org>
> Cc: xen-devel@lists.xenproject.org; Andrew Cooper <andrew.cooper3@citrix.com>; George Dunlap
> <george.dunlap@citrix.com>; Ian Jackson <ian.jackson@eu.citrix.com>; Julien Grall <julien@xen.org>;
> Stefano Stabellini <sstabellini@kernel.org>; Wei Liu <wl@xen.org>
> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
> 
> On 27.03.2020 19:50, Paul Durrant wrote:
> > Domain context is state held in the hypervisor that does not come under
> > the category of 'HVM state' but is instead 'PV state' that is common
> > between PV guests and enlightened HVM guests (i.e. those that have PV
> > drivers) such as event channel state, grant entry state, etc.
> 
> Without looking at the patch details yet, I'm having some difficulty
> understanding how this is going to work in a safe/correct manner. I
> suppose for LU the system is in a frozen enough state that
> snapshotting and copying state like this is okay, but ...
> 
> > To allow enlightened HVM guests to be migrated without their co-operation
> > it will be necessary to transfer such state along with the domain's
> > memory image, architectural state, etc. This framework is introduced for
> > that purpose.
> >
> > This patch adds the new public header and the low level implementation,
> > entered via the domain_save() or domain_load() functions. Subsequent
> > patches will introduce other parts of the framwork, and code that will
> > make use of it within the current version of the libxc migration stream.
> 
> ... here you suggest (and patch 5 appears to match this) that this
> is going to be used even in "normal" migration streams.

Well, 'transparent' (or non-cooperative) migration will only work in some cases but it definitely does work.

> All of the
> items named are communication vehicles, and hence there are always
> two sides that can influence the state. For event channels, the
> other side (which isn't paused) or the hardware (for passed through
> devices) might signal them, or it (just the former obviously) could
> close their end, resulting in a state change also for the domain
> being migrated. If this happens after the snapshot was taken, the
> state change is lost.

Indeed, which is why we *do* rely on co-operation from the other end of the event channels in the migration case. In the initial case it is likely we'll veto transparent migration unless all event channels are connected to either dom0 or Xen.

> 
> Otoh I'm sure the case was considered, so perhaps I'm simply missing
> some crucial aspect (which then could do with spelling out in the
> description of the cover letter).
> 

Does that need to be explained for a series that is just infrastructure? The overall design doc is now committed in the repo (although it may need some expansion in future) so I could point at that.
I don't think I'm giving anything away when I say that EC2's downstream code simply (ab)uses HVM save records for transferring the extra state; all I'm trying to do here is create something cleaner onto which I can re-base and upstream the EC2 code.

  Paul
Jan Beulich April 2, 2020, 11:08 a.m. UTC | #7
On 02.04.2020 11:58, Paul Durrant wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: 01 April 2020 15:51
>> To: Paul Durrant <paul@xen.org>
>> Cc: xen-devel@lists.xenproject.org; Andrew Cooper <andrew.cooper3@citrix.com>; George Dunlap
>> <george.dunlap@citrix.com>; Ian Jackson <ian.jackson@eu.citrix.com>; Julien Grall <julien@xen.org>;
>> Stefano Stabellini <sstabellini@kernel.org>; Wei Liu <wl@xen.org>
>> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
>>
>> On 27.03.2020 19:50, Paul Durrant wrote:
>>> Domain context is state held in the hypervisor that does not come under
>>> the category of 'HVM state' but is instead 'PV state' that is common
>>> between PV guests and enlightened HVM guests (i.e. those that have PV
>>> drivers) such as event channel state, grant entry state, etc.
>>
>> Without looking at the patch details yet, I'm having some difficulty
>> understanding how this is going to work in a safe/correct manner. I
>> suppose for LU the system is in a frozen enough state that
>> snapshotting and copying state like this is okay, but ...
>>
>>> To allow enlightened HVM guests to be migrated without their co-operation
>>> it will be necessary to transfer such state along with the domain's
>>> memory image, architectural state, etc. This framework is introduced for
>>> that purpose.
>>>
>>> This patch adds the new public header and the low level implementation,
>>> entered via the domain_save() or domain_load() functions. Subsequent
>>> patches will introduce other parts of the framwork, and code that will
>>> make use of it within the current version of the libxc migration stream.
>>
>> ... here you suggest (and patch 5 appears to match this) that this
>> is going to be used even in "normal" migration streams.
> 
> Well, 'transparent' (or non-cooperative) migration will only work in some cases but it definitely does work.
> 
>> All of the
>> items named are communication vehicles, and hence there are always
>> two sides that can influence the state. For event channels, the
>> other side (which isn't paused) or the hardware (for passed through
>> devices) might signal them, or it (just the former obviously) could
>> close their end, resulting in a state change also for the domain
>> being migrated. If this happens after the snapshot was taken, the
>> state change is lost.
> 
> Indeed, which is why we *do* rely on co-operation from the other end
> of the event channels in the migration case. In the initial case it
> is likely we'll veto transparent migration unless all event channels
> are connected to either dom0 or Xen.

Co-operation for "normal" migration, iirc, consists of tearing down
and re-establishing everything. There's simply no risk of losing e.g.
events this way.

>> Otoh I'm sure the case was considered, so perhaps I'm simply missing
>> some crucial aspect (which then could do with spelling out in the
>> description of the cover letter).
>>
> 
> Does that need to be explained for a series that is just
> infrastructure?

I think so, yes - this infrastructure is pointless to introduce if
it doesn't allow fulfilling all requirements. Pointing at the design
doc (in the cover letter) may be enough if all aspects are covered
by what's there. I wouldn't have assumed using this infrastructure
for co-operative migration would also be mentioned there.

Considering the situation with event channels (all closed), doing
what you do for the shared info page is probably going to be fine;
large parts of it are in a known state (or need re-filling on the
destination) anyway. What other plans do you have for non-LU
migration wrt this new infrastructure? If the shared info page is
all that's going to get migrated with its help, I'd wonder whether
the infrastructure wasn't better conditional upon a LU config
option, and the shared info migration was left as it is now.

> The overall design doc is now committed in the repo (although it may
> need some expansion in future) so I could point at that.
> I don't think I'm giving anything away when I say that EC2's
> downstream code simply (ab)uses HVM save records for transferring
> the extra state; all I'm trying to do here is create something
> cleaner onto which I can re-base and upstream the EC2 code.

That was my guess, indeed.

Jan
Paul Durrant April 2, 2020, 2 p.m. UTC | #8
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 02 April 2020 12:08
> To: paul@xen.org
> Cc: xen-devel@lists.xenproject.org; 'Andrew Cooper' <andrew.cooper3@citrix.com>; 'George Dunlap'
> <george.dunlap@citrix.com>; 'Ian Jackson' <ian.jackson@eu.citrix.com>; 'Julien Grall'
> <julien@xen.org>; 'Stefano Stabellini' <sstabellini@kernel.org>; 'Wei Liu' <wl@xen.org>
> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
> 
> On 02.04.2020 11:58, Paul Durrant wrote:
> >> -----Original Message-----
> >> From: Jan Beulich <jbeulich@suse.com>
> >> Sent: 01 April 2020 15:51
> >> To: Paul Durrant <paul@xen.org>
> >> Cc: xen-devel@lists.xenproject.org; Andrew Cooper <andrew.cooper3@citrix.com>; George Dunlap
> >> <george.dunlap@citrix.com>; Ian Jackson <ian.jackson@eu.citrix.com>; Julien Grall <julien@xen.org>;
> >> Stefano Stabellini <sstabellini@kernel.org>; Wei Liu <wl@xen.org>
> >> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
> >>
> >> On 27.03.2020 19:50, Paul Durrant wrote:
> >>> Domain context is state held in the hypervisor that does not come under
> >>> the category of 'HVM state' but is instead 'PV state' that is common
> >>> between PV guests and enlightened HVM guests (i.e. those that have PV
> >>> drivers) such as event channel state, grant entry state, etc.
> >>
> >> Without looking at the patch details yet, I'm having some difficulty
> >> understanding how this is going to work in a safe/correct manner. I
> >> suppose for LU the system is in a frozen enough state that
> >> snapshotting and copying state like this is okay, but ...
> >>
> >>> To allow enlightened HVM guests to be migrated without their co-operation
> >>> it will be necessary to transfer such state along with the domain's
> >>> memory image, architectural state, etc. This framework is introduced for
> >>> that purpose.
> >>>
> >>> This patch adds the new public header and the low level implementation,
> >>> entered via the domain_save() or domain_load() functions. Subsequent
> >>> patches will introduce other parts of the framwork, and code that will
> >>> make use of it within the current version of the libxc migration stream.
> >>
> >> ... here you suggest (and patch 5 appears to match this) that this
> >> is going to be used even in "normal" migration streams.
> >
> > Well, 'transparent' (or non-cooperative) migration will only work in some cases but it definitely
> does work.
> >
> >> All of the
> >> items named are communication vehicles, and hence there are always
> >> two sides that can influence the state. For event channels, the
> >> other side (which isn't paused) or the hardware (for passed through
> >> devices) might signal them, or it (just the former obviously) could
> >> close their end, resulting in a state change also for the domain
> >> being migrated. If this happens after the snapshot was taken, the
> >> state change is lost.
> >
> > Indeed, which is why we *do* rely on co-operation from the other end
> > of the event channels in the migration case. In the initial case it
> > is likely we'll veto transparent migration unless all event channels
> > are connected to either dom0 or Xen.
> 
> Co-operation for "normal" migration, iirc, consists of tearing down
> and re-establishing everything. There's simply no risk of losing e.g.
> events this way.

No, indeed, everything basically has to be re-established from scratch for normal migration. Transparent migration, as it is implemented at the moment, does rely on injecting potentially spurious events on resume. I think the alternative would be to have the PV backends send an event when they re-connect to a shared ring rather than having Xen do it.

> 
> >> Otoh I'm sure the case was considered, so perhaps I'm simply missing
> >> some crucial aspect (which then could do with spelling out in the
> >> description of the cover letter).
> >>
> >
> > Does that need to be explained for a series that is just
> > infrastructure?
> 
> I think so, yes - this infrastructure is pointless to introduce if
> it doesn't allow fulfilling all requirements. Pointing at the design
> doc (in the cover letter) may be enough if all aspects are covered
> by what's there. I wouldn't have assumed using this infrastructure
> also for co-operative migration to also be mentioned there.

No, I can mention the plan to use it to replace existing libxc migration records in the cover letter even though it is stated in the comment for patch #5.

> 
> Considering the situation with event channels (all closed), doing
> what you do for the shared info page is probably going to be fine;
> large parts of it are in a known state (or need re-filling on the
> destination) anyway. What other plans do you have for non-LU
> migration wrt this new infrastructure?

Well, as discussed above, event channels are one, then there's the grant table. The downstream code has a record for the wallclock, but I think the RTC covers that now that we've added the code to preserve the offset. We also have records for vcpu timers and runstate, but I'm not convinced we actually need those.

> If the shared info page is
> all that's going to get migrated with its help, I'd wonder whether
> the infrastructure wasn't better conditional upon a LU config
> option, and the shared info migration was left as it is now.
> 

The shared info is also migrated for HVM guests, so it would have meant moving the mapping code around anyway; thus it seems sensible to use the new domain context for that in PV guests' normal migration stream too.

  Paul
Jan Beulich April 3, 2020, 8:38 a.m. UTC | #9
On 02.04.2020 16:00, Paul Durrant wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: 02 April 2020 12:08
>>
>> Considering the situation with event channels (all closed), doing
>> what you do for the shared info page is probably going to be fine;
>> large parts of it are in a known state (or need re-filling on the
>> destination) anyway. What other plans do you have for non-LU
>> migration wrt this new infrastructure?
> 
> Well, as discussed above, event channels are one, then there's the grant table. The downstream code has a record for the wallclock, but I think the RTC covers that now that we've added the code to preserve the offset. We also have records for vcpu timers and runstate, but I'm not convinced we actually need those.

Timers may need preserving, but runstate may be avoidable. Depends
on whether the accumulation in time[4] is fine to start from zero
again after (transparent) migration.

>> If the shared info page is
>> all that's going to get migrated with its help, I'd wonder whether
>> the infrastructure wasn't better conditional upon a LU config
>> option, and the shared info migration was left as it is now.
> 
> The shared info is also migrated for HVM guests, so it would have meant moving the mapping code around anyway; thus it seems sensible to use the new domain context for that in PV guests' normal migration stream too.

Hmm, okay - I'll see to get to look at the actual code then.

Jan
Paul Durrant April 3, 2020, 3:55 p.m. UTC | #10
> -----Original Message-----
[snip]
> > +
> > +#include <xen/save.h>
> > +
> > +struct domain_context {
> > +    bool log;
> > +    struct domain_save_descriptor desc;
> > +    domain_copy_entry copy;
> 
> As your new framework is technically an extension of the existing one, it
> would be good to explain why we diverge in the definitions.
> 

I don't follow. What is diverging? I explain in the commit comment that this is a parallel framework. Do I need to justify why it is not a carbon copy of the HVM one?

> > +    void *priv;
> > +};
> > +
> > +static struct {
> > +    const char *name;
> > +    bool per_vcpu;
> > +    domain_save_handler save;
> > +    domain_load_handler load;
> > +} handlers[DOMAIN_SAVE_CODE_MAX + 1];
> > +
> > +void __init domain_register_save_type(unsigned int tc, const char *name,
> > +                                      bool per_vcpu,
> > +                                      domain_save_handler save,
> > +                                      domain_load_handler load)
> > +{
> > +    BUG_ON(tc > ARRAY_SIZE(handlers));
> > +
> > +    ASSERT(!handlers[tc].save);
> > +    ASSERT(!handlers[tc].load);
> > +
> > +    handlers[tc].name = name;
> > +    handlers[tc].per_vcpu = per_vcpu;
> > +    handlers[tc].save = save;
> > +    handlers[tc].load = load;
> > +}
> > +
> > +int domain_save_entry(struct domain_context *c, unsigned int tc,
> > +                      const char *name, const struct vcpu *v, void *src,
> 
> I realize that 'src' is not const because of how you define 'copy'; however,
> I would prefer to rework the interface so that we can keep the const in the
> definition. This may mean having to define two callbacks rather than one.

That seems a bit ugly for the sake of a const but I guess I could create a union with a copy_in and copy_out. I'll see how that looks.
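
Something like this, perhaps (just a sketch; the names are placeholders):

    typedef int (*domain_copy_in)(void *priv, void *dst, size_t len);
    typedef int (*domain_copy_out)(void *priv, const void *src, size_t len);

    struct domain_context {
        bool log;
        struct domain_save_descriptor desc;
        union {
            domain_copy_out copy_out; /* writes the buffer: save side */
            domain_copy_in copy_in;   /* reads the buffer: load side */
        };
        void *priv;
    };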

> 
> > +                      size_t src_len)
> 
> On a 64-bit architecture, size_t is 64-bit, but the record only stores a
> 32-bit length. Would it make sense to add a sanity check in the code?
> 

True. Given this is new, I think I'll just bump the length to 64 bits.
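
I.e. (sketch) the descriptor would become:

    struct domain_save_descriptor {
        uint16_t typecode;
        uint16_t instance;
        uint32_t _pad;   /* explicit padding; keeps 'length' aligned */
        uint64_t length; /* widened from uint32_t */
    };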

> > +{
> > +    int rc;
> > +
> > +    if ( c->log && tc != DOMAIN_SAVE_CODE(HEADER) &&
> > +         tc != DOMAIN_SAVE_CODE(END) )
> > +        gdprintk(XENLOG_INFO, "%pv save: %s (%lu)\n", v, name, src_len);
> > +
> > +    if ( !IS_ALIGNED(src_len, 8) )
> 
> Why not add implicit padding? This would avoid having to chase errors during
> saving because the length wasn't a multiple of 8.
> 

I don't think implicit padding is worth it. This error should only happen if a badly defined save record type is added to the code, so perhaps I ought to add an ASSERT here as well as deal with the error.
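
I.e. (untested):

    if ( !IS_ALIGNED(src_len, 8) )
    {
        ASSERT_UNREACHABLE(); /* a record with a misaligned length is a bug */
        return -EINVAL;
    }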

> > +        return -EINVAL;
> > +
> > +    BUG_ON(tc != c->desc.typecode);
> > +    BUG_ON(v->vcpu_id != c->desc.instance);
> 
> Does the descriptor really need to be filled by domain_save()? Can't it
> be done here, so we can avoid the two BUG_ON()s?
> 

Yes it can, but this serves as a sanity check that the save code is actually doing what it should; hence these are BUG_ON()s and not error exits.

> > +    c->desc.length = src_len;
> > +
> > +    rc = c->copy(c->priv, &c->desc, sizeof(c->desc));
> > +    if ( rc )
> > +        return rc;
> > +
> > +    return c->copy(c->priv, src, src_len);
> > +}
> > +
> > +int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
> > +                unsigned long mask, bool dry_run)
> > +{
> > +    struct domain_context c = {
> > +        .copy = copy,
> > +        .priv = priv,
> > +        .log = !dry_run,
> > +    };
> > +    struct domain_save_header h = {
> > +        .magic = DOMAIN_SAVE_MAGIC,
> > +        .version = DOMAIN_SAVE_VERSION,
> > +    };
> > +    struct domain_save_header e;
> > +    unsigned int i;
> > +    int rc;
> > +
> > +    ASSERT(d != current->domain);
> > +
> > +    if ( d->is_dying )
> > +        return -EINVAL;
> > +
> > +    domain_pause(d);
> > +
> > +    c.desc.typecode = DOMAIN_SAVE_CODE(HEADER);
> > +
> > +    rc = DOMAIN_SAVE_ENTRY(HEADER, &c, d->vcpu[0], &h, sizeof(h));
> > +    if ( rc )
> > +        goto out;
> > +
> > +    for ( i = 0; i < ARRAY_SIZE(handlers); i++ )
> 
> AFAIU, with this solution, if there are dependencies between records, the
> dependencies will have to have a lower "index". I think we may have some
> dependencies with guest transparent migration; for instance, we need to
> restore the event ABI before restoring the event channels.
> 

Is that just a downstream implementation detail though? I would hope that there are no ordering dependencies between records.

> Should we rely on the index for the dependency?
>

If we do need ordering dependencies then I guess it would need to be explicit.
 
> In any case, I think we want to document it.
>

I can certainly document that save handlers are invoked in code order.
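
E.g. a note along these lines next to the handlers array:

    /*
     * Handlers are invoked in ascending typecode order, so a record type
     * must be given a code higher than that of any record it depends on.
     */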

> > +    {
> > +        domain_save_handler save = handlers[i].save;
> > +
> > +        if ( (mask && !test_bit(i, &mask)) || !save )
> 
> The type of mask suggests it is not possible to have more than 32 different
> types of record if we want to be arch-agnostic. Even if we focus on 64-bit
> architectures, that is only 64 records.
>
> This is not very future-proof, but might be OK if this is not exposed
> outside of the hypervisor (I haven't looked at the rest of the series
> yet). However, we at least need a BUILD_BUG_ON() in place. So please
> make sure DOMAIN_SAVE_CODE_MAX is always less than 64.
>
> Also, what if a bit is set in the mask but the corresponding record does not
> exist? Shouldn't we return an error?
> 

TBH I think 32 will be enough... I would not expect this context space to grow very much, if at all, once transparent migration is working. I think I'll just drop the mask for now; it's not actually clear to me we'll make use of it... just seemed like a nice-to-have.

> > +            continue;
> > +
> > +        memset(&c.desc, 0, sizeof(c.desc));
> > +        c.desc.typecode = i;
> > +
> > +        if ( handlers[i].per_vcpu )
> > +        {
> > +            struct vcpu *v;
> > +
> > +            for_each_vcpu ( d, v )
> > +            {
> > +                c.desc.instance = v->vcpu_id;
> > +
> > +                rc = save(v, &c, dry_run);
> > +                if ( rc )
> > +                    goto out;
> > +            }
> > +        }
> > +        else
> > +        {
> > +            rc = save(d->vcpu[0], &c, dry_run);
> > +            if ( rc )
> > +                goto out;
> > +        }
> > +    }
> > +
> > +    memset(&c.desc, 0, sizeof(c.desc));
> > +    c.desc.typecode = DOMAIN_SAVE_CODE(END);
> > +
> > +    rc = DOMAIN_SAVE_ENTRY(END, &c, d->vcpu[0], &e, 0);
> > +
> > + out:
> > +    domain_unpause(d);
> > +
> > +    return rc;
> > +}
> > +
> > +int domain_load_entry(struct domain_context *c, unsigned int tc,
> > +                      const char *name, const struct vcpu *v, void *dst,
> > +                      size_t dst_len, bool exact)
> > +{
> > +    int rc;
> > +
> > +    if ( c->log && tc != DOMAIN_SAVE_CODE(HEADER) &&
> > +         tc != DOMAIN_SAVE_CODE(END) )
> > +        gdprintk(XENLOG_INFO, "%pv load: %s (%lu)\n", v, name, dst_len);
> > +
> > +    BUG_ON(tc != c->desc.typecode);
> > +    BUG_ON(v->vcpu_id != c->desc.instance);
> 
> Is it really warranted to crash the host? What would happen if the values
> were wrong?
> 

It would mean the code is fairly seriously buggy in that the load handler is trying to load a record other than the type it registered for, or for a vcpu other than the one it was passed.

> > +
> > +    if ( (exact ?
> > +          (dst_len != c->desc.length) : (dst_len < c->desc.length)) ||
> 
> Using a ternary in an if is really confusing. How about:
> 
> dst_len < c->desc.length || (exact && dst_len != c->desc.length) ||
> 
> I understand that there would be two checks for the exact case, but I
> think it is better than a ternary.

I'm going to re-work this I think.

> 
> However, what is the purpose of the 'exact' parameter? If you set it to
> false, how do you know the size you read?

The point of the exact parameter is whether the caller can only consume a record of the correct length, or whether it can handle an undersized one, which gets zero-padded. (It's the same as the zeroextend option in HVM records.)
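
E.g. (with a hypothetical FOO record type) a caller that can cope with a shorter record from an older save stream would pass false:

    struct domain_foo foo;

    /* Accept any record up to sizeof(foo); a short one is zero-extended. */
    rc = DOMAIN_LOAD_ENTRY(FOO, c, v, &foo, sizeof(foo), false);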

> 
> > +         !IS_ALIGNED(c->desc.length, 8) )
> > +        return -EINVAL;
> > +
> > +    rc = c->copy(c->priv, dst, c->desc.length);
> > +    if ( rc )
> > +        return rc;
> > +
> > +    if ( !exact )
> > +    {
> > +        dst += c->desc.length;
> > +        memset(dst, 0, dst_len - c->desc.length);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +int domain_load(struct domain *d,  domain_copy_entry copy, void *priv,
> > +                unsigned long mask)
> > +{
> > +    struct domain_context c = {
> > +        .copy = copy,
> > +        .priv = priv,
> > +        .log = true,
> > +    };
> > +    struct domain_save_header h, e;
> > +    int rc;
> > +
> > +    ASSERT(d != current->domain);
> > +
> > +    if ( d->is_dying )
> > +        return -EINVAL;
> 
> What protects you against the domain dying right after this check?
> 

Nothing. It's just to avoid doing pointless work if we can.

> > +
> > +    rc = c.copy(c.priv, &c.desc, sizeof(c.desc));
> > +    if ( rc )
> > +        return rc;
> > +
> > +    if ( c.desc.typecode != DOMAIN_SAVE_CODE(HEADER) ||
> > +         c.desc.instance != 0 )
> > +        return -EINVAL;
> > +
> > +    rc = DOMAIN_LOAD_ENTRY(HEADER, &c, d->vcpu[0], &h, sizeof(h), true);
> > +    if ( rc )
> > +        return rc;
> > +
> > +    if ( h.magic != DOMAIN_SAVE_MAGIC || h.version != DOMAIN_SAVE_VERSION )
> > +        return -EINVAL;
> > +
> > +    domain_pause(d);
> > +
> > +    for (;;)
> > +    {
> > +        unsigned int i;
> > +        domain_load_handler load;
> > +        struct vcpu *v;
> > +
> > +        rc = c.copy(c.priv, &c.desc, sizeof(c.desc));
> > +        if ( rc )
> > +            break;
> > +
> > +        if ( c.desc.typecode == DOMAIN_SAVE_CODE(END) ) {
> > +            rc = DOMAIN_LOAD_ENTRY(END, &c, d->vcpu[0], &e, 0, true);
> > +            break;
> > +        }
> > +
> > +        rc = -EINVAL;
> > +        if ( c.desc.typecode >= ARRAY_SIZE(handlers) ||
> > +             c.desc.instance >= d->max_vcpus )
> 
> Nothing in the documentation suggests that c.desc.instance should be 0
> when the record is for the domain.
> 

Ok. I'll put a comment somewhere.

> > +            break;
> > +
> > +        i = c.desc.typecode;
> > +        load = handlers[i].load;
> > +        v = d->vcpu[c.desc.instance];
> > +
> > +        if ( mask && !test_bit(i, &mask) )
> > +        {
> > +            /* Sink the data */
> > +            rc = c.copy(c.priv, NULL, c.desc.length);
> > +            if ( rc )
> > +                break;
> > +
> > +            continue;
> > +        }
> > +
> > +        rc = load ? load(v, &c) : -EOPNOTSUPP;
> > +        if ( rc )
> > +            break;
> > +    }
> > +
> > +    domain_unpause(d);
> > +
> > +    return rc;
> > +}
> > +
> > +/*
> > + * Local variables:
> > + * mode: C
> > + * c-file-style: "BSD"
> > + * c-basic-offset: 4
> > + * tab-width: 4
> > + * indent-tabs-mode: nil
> > + * End:
> > + */
> > diff --git a/xen/include/public/save.h b/xen/include/public/save.h
> > new file mode 100644
> > index 0000000000..84981cd0f6
> > --- /dev/null
> > +++ b/xen/include/public/save.h
> > @@ -0,0 +1,74 @@
> > +/*
> > + * save.h
> > + *
> > + * Structure definitions for common PV/HVM domain state that is held by
> > + * Xen and must be saved along with the domain's memory.
> > + *
> > + * Copyright Amazon.com Inc. or its affiliates.
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a copy
> > + * of this software and associated documentation files (the "Software"), to
> > + * deal in the Software without restriction, including without limitation the
> > + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
> > + * sell copies of the Software, and to permit persons to whom the Software is
> > + * furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice shall be included in
> > + * all copies or substantial portions of the Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> > + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> > + * DEALINGS IN THE SOFTWARE.
> > + */
> > +
> > +#ifndef __XEN_PUBLIC_SAVE_H__
> > +#define __XEN_PUBLIC_SAVE_H__
> > +
> > +#include "xen.h"
> > +
> > +/* Each entry is preceded by a descriptor */
> > +struct domain_save_descriptor {
> > +    uint16_t typecode;
> > +    uint16_t instance;
> > +    /*
> > +     * Entry length not including this descriptor. Entries must be padded
> > +     * to a multiple of 8 bytes to make sure descriptors remain correctly
> > +     * aligned.
> > +     */
> > +    uint32_t length;
> > +};
> > +
> > +/*
> > + * Each entry has a type associated with it. DECLARE_DOMAIN_SAVE_TYPE
> > + * binds these things together.
> > + */
> > +#define DECLARE_DOMAIN_SAVE_TYPE(_x, _code, _type) \
> > +    struct __DOMAIN_SAVE_TYPE_##_x { _type t; char c[_code]; };
> > +
> > +#define DOMAIN_SAVE_TYPE(_x) \
> > +    typeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->t)
> > +#define DOMAIN_SAVE_CODE(_x) \
> > +    (sizeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->c))
> > +#define DOMAIN_SAVE_MASK(_x) (1u << DOMAIN_SAVE_CODE(_x))
> > +
> > +/* Terminating entry */
> > +struct domain_save_end {};
> > +DECLARE_DOMAIN_SAVE_TYPE(END, 0, struct domain_save_end);
> > +
> > +#define DOMAIN_SAVE_MAGIC   0x53415645
> > +#define DOMAIN_SAVE_VERSION 0x00000001
> > +
> > +/* Initial entry */
> > +struct domain_save_header {
> > +    uint32_t magic;             /* Must be DOMAIN_SAVE_MAGIC */
> > +    uint32_t version;           /* Save format version */
> 
> Would it make sense to have the version of Xen in the stream?
> 

Why? How would it help?

  Paul
Julien Grall April 3, 2020, 5:24 p.m. UTC | #11
Hi Paul,

On 03/04/2020 16:55, Paul Durrant wrote:
>> -----Original Message-----
> [snip]
>>> +
>>> +#include <xen/save.h>
>>> +
>>> +struct domain_context {
>>> +    bool log;
>>> +    struct domain_save_descriptor desc;
>>> +    domain_copy_entry copy;
>>
>> As your new framework is technically an extension of existing one, it
>> would be good to explain why we diverge in the definitions.
>>
> 
> I don't follow. What is diverging? I explain in the commit comment that this is a parallel framework. Do I need to justify why it is not a carbon copy of the HVM one?

Well, they are both restoring/saving guest state; the only difference is 
that the existing one focuses on HVM state.

So it would make sense, long term, to have only one hypercall and tell 
it what you want to save. In fact, some of the improvements here would 
definitely make the HVM one nicer to use (at least in the context of LU).

 From the commit message, it is not clear to me why a new framework is 
needed, nor why the infrastructure is at the same time so similar to, 
yet different from, the existing one.

> 
>>> +    void *priv;
>>> +};
>>> +
>>> +static struct {
>>> +    const char *name;
>>> +    bool per_vcpu;
>>> +    domain_save_handler save;
>>> +    domain_load_handler load;
>>> +} handlers[DOMAIN_SAVE_CODE_MAX + 1];
>>> +
>>> +void __init domain_register_save_type(unsigned int tc, const char *name,
>>> +                                      bool per_vcpu,
>>> +                                      domain_save_handler save,
>>> +                                      domain_load_handler load)
>>> +{
>>> +    BUG_ON(tc > ARRAY_SIZE(handlers));
>>> +
>>> +    ASSERT(!handlers[tc].save);
>>> +    ASSERT(!handlers[tc].load);
>>> +
>>> +    handlers[tc].name = name;
>>> +    handlers[tc].per_vcpu = per_vcpu;
>>> +    handlers[tc].save = save;
>>> +    handlers[tc].load = load;
>>> +}
>>> +
>>> +int domain_save_entry(struct domain_context *c, unsigned int tc,
>>> +                      const char *name, const struct vcpu *v, void *src,
>>
>> I realize that 'src' is not const because of how you define copy;
>> however, I would prefer to rework the interface so that the const can
>> be kept in the definition. This may mean having to define two callbacks
>> rather than one.
> 
> That seems a bit ugly for the sake of a const but I guess I could create a union with a copy_in and copy_out. I'll see how that looks.

I would really like to map the Live-Update state read-only (it is not 
meant to be modified by the new Xen), and therefore adding const here 
would enable us to catch most of these mistakes at build time rather 
than at runtime.
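
As a minimal sketch of that split (the typedef names are illustrative, 
not part of the posted patch):

    typedef int (*domain_write_entry)(void *priv, const void *src,
                                      size_t len);
    typedef int (*domain_read_entry)(void *priv, void *dst, size_t len);

    struct domain_context {
        bool log;
        struct domain_save_descriptor desc;
        union {
            domain_write_entry write; /* used by domain_save() */
            domain_read_entry read;   /* used by domain_load() */
        } copy;
        void *priv;
    };

This would let domain_save_entry() take 'const void *src' without any 
casting.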
>> Why not add implicit padding? This would avoid having to chase errors
>> during saving because the length wasn't a multiple of 8.
>>
> 
> I don't think implicit padding is worth it. This error should only happen if a badly defined save record type is added to the code so perhaps I ought to add an ASSERT here as well as deal with the error.
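
A sketch of what Paul describes, assuming the existing length check in 
domain_save_entry():

    if ( !IS_ALIGNED(src_len, 8) )
    {
        ASSERT_UNREACHABLE(); /* badly defined save record type */
        return -EINVAL;
    }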

I wish I could say that every error will be caught during review. 
Unfortunately, that is not true.

If you define the size dynamically, then a misalignment may not be 
noticed until you go to production. The implicit padding would at least 
make you more resilient.

It may not be that bad, as you would simply be unable to save the guest. 
Anyway, this would be a nice feature to have, but not a must.
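
For comparison, a sketch of the implicit-padding variant of 
domain_save_entry(), assuming the copy callback can simply be invoked 
once more for the pad bytes:

    static const uint8_t zeroes[8];
    size_t pad = -src_len & 7; /* bytes up to the next 8-byte boundary */
    int rc;

    c->desc.length = src_len + pad;

    rc = c->copy(c->priv, &c->desc, sizeof(c->desc));
    if ( !rc )
        rc = c->copy(c->priv, src, src_len);
    if ( !rc && pad )
        rc = c->copy(c->priv, (void *)zeroes, pad); /* copy() only reads */

    return rc;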

> 
>>> +        return -EINVAL;
>>> +
>>> +    BUG_ON(tc != c->desc.typecode);
>>> +    BUG_ON(v->vcpu_id != c->desc.instance);
>>
>> Does the descriptor really need to be filled by domain_save()? Can't it
>> be done here, so we can avoid the two BUG_ON()s?
>>
> 
> Yes it can but this serves as a sanity check that the save code is actually doing what it should. Hence why these are BUG_ON()s and not error exits.

This does not really answer the question of why c->desc.instance and 
c->desc.typecode are not set here. What is the advantage of doing it 
earlier and still passing the information as parameters? Why not just 
set them here?
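
Concretely, a sketch of the alternative, with domain_save_entry() 
filling in the descriptor from its own parameters:

    c->desc.typecode = tc;
    c->desc.instance = v->vcpu_id;
    c->desc.length = src_len;

which would make the pre-filling in domain_save(), and hence the 
BUG_ON()s, unnecessary.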

> 
>>> +    c->desc.length = src_len;
>>> +
>>> +    rc = c->copy(c->priv, &c->desc, sizeof(c->desc));
>>> +    if ( rc )
>>> +        return rc;
>>> +
>>> +    return c->copy(c->priv, src, src_len);
>>> +}
>>> +
>>> +int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
>>> +                unsigned long mask, bool dry_run)
>>> +{
>>> +    struct domain_context c = {
>>> +        .copy = copy,
>>> +        .priv = priv,
>>> +        .log = !dry_run,
>>> +    };
>>> +    struct domain_save_header h = {
>>> +        .magic = DOMAIN_SAVE_MAGIC,
>>> +        .version = DOMAIN_SAVE_VERSION,
>>> +    };
>>> +    struct domain_save_header e;
>>> +    unsigned int i;
>>> +    int rc;
>>> +
>>> +    ASSERT(d != current->domain);
>>> +
>>> +    if ( d->is_dying )
>>> +        return -EINVAL;
>>> +
>>> +    domain_pause(d);
>>> +
>>> +    c.desc.typecode = DOMAIN_SAVE_CODE(HEADER);
>>> +
>>> +    rc = DOMAIN_SAVE_ENTRY(HEADER, &c, d->vcpu[0], &h, sizeof(h));
>>> +    if ( rc )
>>> +        goto out;
>>> +
>>> +    for ( i = 0; i < ARRAY_SIZE(handlers); i++ )
>>
>> AFAIU, with this solution, if there are dependencies between records,
>> the dependencies will have to have a lower "index". I think we may have
>> some dependencies with guest transparent migration, such as needing to
>> restore the event ABI before restoring the event channels.
>>
> 
> Is that just a downstream implementation detail though? I would hope that there are no ordering dependencies between records.

So far, I have never managed to avoid ordering between records on the 
Live Update side.

In my previous e-mail, I suggested there are dependencies between 
restoring the event ABI (such as the control page...) and restoring the 
event channels. How do you suggest solving that without imposing an 
order between records?

>>> +    {
>>> +        domain_save_handler save = handlers[i].save;
>>> +
>>> +        if ( (mask && !test_bit(i, &mask)) || !save )
>>
>> The type of mask suggests it is not possible to have more than 32
>> different types of record if we want to be arch-agnostic. Even if we
>> focus on 64-bit arches, that is still only 64 records.
>>
>> This is not very future-proof, but might be OK if this is not exposed
>> outside of the hypervisor (I haven't looked at the rest of the series
>> yet). However, we at least need a BUILD_BUG_ON() in place. So please
>> make sure DOMAIN_SAVE_CODE_MAX is always less than 64.
>>
>> Also, what if there is a bit set in the mask and the record does not
>> exist? Shouldn't we return an error?
>>
> 
> TBH I think 32 will be enough... I would not expect this context space to grow very much, if at all, once transparent migration is working. I think I'll just drop the mask for now; it's not actually clear to me we'll make use of it... just seemed like a nice-to-have.

So far for Live-Update we have 20 records (not counting the Guest 
Transparent ones). We might be able to do without some, but we also 
haven't restored all the state yet.
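
As for the BUILD_BUG_ON() requested above, a one-line sketch (e.g. at 
the top of domain_save()):

    BUILD_BUG_ON(DOMAIN_SAVE_CODE_MAX >= BITS_PER_LONG);

so the build breaks as soon as a record code no longer fits in the 
unsigned long mask.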

> 
>>> +            continue;
>>> +
>>> +        memset(&c.desc, 0, sizeof(c.desc));
>>> +        c.desc.typecode = i;
>>> +
>>> +        if ( handlers[i].per_vcpu )
>>> +        {
>>> +            struct vcpu *v;
>>> +
>>> +            for_each_vcpu ( d, v )
>>> +            {
>>> +                c.desc.instance = v->vcpu_id;
>>> +
>>> +                rc = save(v, &c, dry_run);
>>> +                if ( rc )
>>> +                    goto out;
>>> +            }
>>> +        }
>>> +        else
>>> +        {
>>> +            rc = save(d->vcpu[0], &c, dry_run);
>>> +            if ( rc )
>>> +                goto out;
>>> +        }
>>> +    }
>>> +
>>> +    memset(&c.desc, 0, sizeof(c.desc));
>>> +    c.desc.typecode = DOMAIN_SAVE_CODE(END);
>>> +
>>> +    rc = DOMAIN_SAVE_ENTRY(END, &c, d->vcpu[0], &e, 0);
>>> +
>>> + out:
>>> +    domain_unpause(d);
>>> +
>>> +    return rc;
>>> +}
>>> +
>>> +int domain_load_entry(struct domain_context *c, unsigned int tc,
>>> +                      const char *name, const struct vcpu *v, void *dst,
>>> +                      size_t dst_len, bool exact)
>>> +{
>>> +    int rc;
>>> +
>>> +    if ( c->log && tc != DOMAIN_SAVE_CODE(HEADER) &&
>>> +         tc != DOMAIN_SAVE_CODE(END) )
>>> +        gdprintk(XENLOG_INFO, "%pv load: %s (%lu)\n", v, name, dst_len);
>>> +
>>> +    BUG_ON(tc != c->desc.typecode);
>>> +    BUG_ON(v->vcpu_id != c->desc.instance);
>>
>> Is it really warranted to crash the host? What would happen if the
>> values were wrong?
>>
> 
> It would mean the code is fairly seriously buggy in that the load handler is trying to load a record other than the type it registered for, or for a vcpu other than the one it was passed.

I understand that, but is it warranted to crash the host? Couldn't you 
just return an error and continue to run?
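
As a sketch, the checks could keep their diagnostic value in debug 
builds while failing gracefully in release builds:

    if ( tc != c->desc.typecode || v->vcpu_id != c->desc.instance )
    {
        ASSERT_UNREACHABLE();
        return -EINVAL;
    }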

> 
>>> +
>>> +    if ( (exact ?
>>> +          (dst_len != c->desc.length) : (dst_len < c->desc.length)) ||
>>
>> Using a ternary in an if is really confusing. How about:
>>
>> dst_len < c->desc.length || (exact && dst_len != c->desc.length) ||
>>
>> I understand that there would be two checks for the exact case, but I
>> think it is better than a ternary.
> 
> I'm going to re-work this I think.
> 
>>
>> However, what is the purpose of the 'exact' parameter? If you set it to
>> false, how do you know the size you read?
> 
> The point of the exact parameter is to say whether the caller can only consume a record of the exact length, or whether it can handle an undersized one, which gets zero-padded. (It's the same as the zeroextend option in HVM records.)
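
For illustration (the FOO record type is hypothetical): a caller that 
can cope with a record saved by an older Xen, before new trailing 
fields were added, would pass exact == false:

    struct domain_save_foo foo; /* newer, larger in-Xen definition */
    int rc = DOMAIN_LOAD_ENTRY(FOO, c, v, &foo, sizeof(foo), false);

    /* on success, bytes beyond c->desc.length read as zero */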
> 
>>
>>> +         !IS_ALIGNED(c->desc.length, 8) )
>>> +        return -EINVAL;
>>> +
>>> +    rc = c->copy(c->priv, dst, c->desc.length);
>>> +    if ( rc )
>>> +        return rc;
>>> +
>>> +    if ( !exact )
>>> +    {
>>> +        dst += c->desc.length;
>>> +        memset(dst, 0, dst_len - c->desc.length);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +int domain_load(struct domain *d,  domain_copy_entry copy, void *priv,
>>> +                unsigned long mask)
>>> +{
>>> +    struct domain_context c = {
>>> +        .copy = copy,
>>> +        .priv = priv,
>>> +        .log = true,
>>> +    };
>>> +    struct domain_save_header h, e;
>>> +    int rc;
>>> +
>>> +    ASSERT(d != current->domain);
>>> +
>>> +    if ( d->is_dying )
>>> +        return -EINVAL;
>>
>> What protects you against the domain dying right after this check?
>>
> 
> Nothing. It's just to avoid doing pointless work if we can.

I wasn't thinking about pointless work, but about whether the rest of 
the code relies on a sound domain. Is it going to be fine?

[...]

>>> +/* Each entry is preceded by a descriptor */
>>> +struct domain_save_descriptor {
>>> +    uint16_t typecode;
>>> +    uint16_t instance;
>>> +    /*
>>> +     * Entry length not including this descriptor. Entries must be padded
>>> +     * to a multiple of 8 bytes to make sure descriptors remain correctly
>>> +     * aligned.
>>> +     */
>>> +    uint32_t length;
>>> +};
>>> +
>>> +/*
>>> + * Each entry has a type associated with it. DECLARE_DOMAIN_SAVE_TYPE
>>> + * binds these things together.
>>> + */
>>> +#define DECLARE_DOMAIN_SAVE_TYPE(_x, _code, _type) \
>>> +    struct __DOMAIN_SAVE_TYPE_##_x { _type t; char c[_code]; };
>>> +
>>> +#define DOMAIN_SAVE_TYPE(_x) \
>>> +    typeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->t)
>>> +#define DOMAIN_SAVE_CODE(_x) \
>>> +    (sizeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->c))
>>> +#define DOMAIN_SAVE_MASK(_x) (1u << DOMAIN_SAVE_CODE(_x))
>>> +
>>> +/* Terminating entry */
>>> +struct domain_save_end {};
>>> +DECLARE_DOMAIN_SAVE_TYPE(END, 0, struct domain_save_end);
>>> +
>>> +#define DOMAIN_SAVE_MAGIC   0x53415645
>>> +#define DOMAIN_SAVE_VERSION 0x00000001
>>> +
>>> +/* Initial entry */
>>> +struct domain_save_header {
>>> +    uint32_t magic;             /* Must be DOMAIN_SAVE_MAGIC */
>>> +    uint32_t version;           /* Save format version */
>>
>> Would it make sense to have the version of Xen in the stream?
>>
> 
> Why? How would it help?

Let's imagine that in 4.14 we introduced a bug in the saving part. This 
is fixed in 4.15, but somehow the version is not bumped. How would you 
differentiate the streams saved by Xen 4.14 so you can still migrate?

If you record the version of Xen in the record automatically, then you 
at least have a way to differentiate the two versions.
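
As a sketch of what that could look like (the new field names are 
illustrative, not part of the posted patch):

    struct domain_save_header {
        uint32_t magic;     /* Must be DOMAIN_SAVE_MAGIC */
        uint32_t version;   /* Save format version */
        uint16_t xen_major; /* Xen version that produced the stream */
        uint16_t xen_minor;
        uint32_t pad;       /* keep the length a multiple of 8 */
    };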

Cheers,
Paul Durrant April 6, 2020, 8:27 a.m. UTC | #12
> -----Original Message-----
> From: Julien Grall <julien@xen.org>
> Sent: 03 April 2020 18:24
> To: paul@xen.org; xen-devel@lists.xenproject.org
> Cc: 'Andrew Cooper' <andrew.cooper3@citrix.com>; 'George Dunlap' <george.dunlap@citrix.com>; 'Ian
> Jackson' <ian.jackson@eu.citrix.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Stefano Stabellini'
> <sstabellini@kernel.org>; 'Wei Liu' <wl@xen.org>
> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
> 
> Hi Paul,
> 
> On 03/04/2020 16:55, Paul Durrant wrote:
> >> -----Original Message-----
> > [snip]
> >>> +
> >>> +#include <xen/save.h>
> >>> +
> >>> +struct domain_context {
> >>> +    bool log;
> >>> +    struct domain_save_descriptor desc;
> >>> +    domain_copy_entry copy;
> >>
> >> As your new framework is technically an extension of existing one, it
> >> would be good to explain why we diverge in the definitions.
> >>
> >
> > I don't follow. What is diverging? I explain in the commit comment that this is a parallel
> framework. Do I need to justify why it is not a carbon copy of the HVM one?
> 
> Well, they are both restoring/saving guest state. The only difference is
> the existing one is focusing on HVM state.
> 
> So it would make sense long term to have only one hypercall and tell
> what you want to save. In fact, some of the improvement here would
> definitely make the HVM one nicer to use (at least in the context of LU).
> 

I guess we could move the HVM save records over to the new framework, but it works for the moment so I don't want to bring it into scope now.

>  From the commit message, it is not clear to me why a new framework and
> why the infrastructure is at the same time different but not.
> 

An alternative would be to move the HVM save code into common code and then try to adapt it. I think that would result in more code churn and ultimately be harder to review. The extra infrastructure introduced here is fairly minimal and, for the moment, only targeting PV state. As I said above there's nothing stopping the HVM records being ported over later once any initial issues have been shaken out.

  Paul
Julien Grall April 6, 2020, 9:08 a.m. UTC | #13
Hi Paul,

On 06/04/2020 09:27, Paul Durrant wrote:
>> -----Original Message-----
>> From: Julien Grall <julien@xen.org>
>> Sent: 03 April 2020 18:24
>> To: paul@xen.org; xen-devel@lists.xenproject.org
>> Cc: 'Andrew Cooper' <andrew.cooper3@citrix.com>; 'George Dunlap' <george.dunlap@citrix.com>; 'Ian
>> Jackson' <ian.jackson@eu.citrix.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Stefano Stabellini'
>> <sstabellini@kernel.org>; 'Wei Liu' <wl@xen.org>
>> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
>>
>> Hi Paul,
>>
>> On 03/04/2020 16:55, Paul Durrant wrote:
>>>> -----Original Message-----
>>> [snip]
>>>>> +
>>>>> +#include <xen/save.h>
>>>>> +
>>>>> +struct domain_context {
>>>>> +    bool log;
>>>>> +    struct domain_save_descriptor desc;
>>>>> +    domain_copy_entry copy;
>>>>
>>>> As your new framework is technically an extension of existing one, it
>>>> would be good to explain why we diverge in the definitions.
>>>>
>>>
>>> I don't follow. What is diverging? I explain in the commit comment that this is a parallel
>> framework. Do I need to justify why it is not a carbon copy of the HVM one?
>>
>> Well, they are both restoring/saving guest state. The only difference is
>> the existing one is focusing on HVM state.
>>
>> So it would make sense long term to have only one hypercall and tell
>> what you want to save. In fact, some of the improvement here would
>> definitely make the HVM one nicer to use (at least in the context of LU).
>>
> 
> I guess we could move the HVM save records over to the new framework, but it works for the moment so I don't want to bring it into scope now.

And I agree, hence why I say "long term" :).

> 
>>   From the commit message, it is not clear to me why a new framework and
>> why the infrastructure is at the same time different but not.
>>
> 
> An alternative would be to move the HVM save code into common code and then try to adapt it. I think that would result in more code churn and ultimately be harder to review. The extra infrastructure introduced here is fairly minimal and, for the moment, only targeting PV state. As I said above there's nothing stopping the HVM records being ported over later once any initial issues have been shaken out.

Code churn is always going to be necessary one day or another if we want 
to consolidate the two.

Having two frameworks is not without risks. There are a few unknowns to 
be answered:
   * Is there any dependency between the two? If yes, what is the ordering?
   * The name of the hypercall does not say anything about "PV". So a 
contributor could think it would be fine to save/restore new HVM state 
in the new generic hypercall. Is it going to be an issue? If so, how do 
we prevent it?
   * Are we going to deprecate the existing framework?

I am not suggesting we should not go with two frameworks, but the 
reasons and implications are not clear to me. Hence, why I think the 
commit message should be expanded with some rationale.

Cheers,
Paul Durrant April 6, 2020, 9:18 a.m. UTC | #14
> -----Original Message-----
> From: Julien Grall <julien@xen.org>
> Sent: 06 April 2020 10:08
> To: paul@xen.org; xen-devel@lists.xenproject.org
> Cc: 'Andrew Cooper' <andrew.cooper3@citrix.com>; 'George Dunlap' <george.dunlap@citrix.com>; 'Ian
> Jackson' <ian.jackson@eu.citrix.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Stefano Stabellini'
> <sstabellini@kernel.org>; 'Wei Liu' <wl@xen.org>
> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
> 
> Hi Paul,
> 
> On 06/04/2020 09:27, Paul Durrant wrote:
> >> -----Original Message-----
> >> From: Julien Grall <julien@xen.org>
> >> Sent: 03 April 2020 18:24
> >> To: paul@xen.org; xen-devel@lists.xenproject.org
> >> Cc: 'Andrew Cooper' <andrew.cooper3@citrix.com>; 'George Dunlap' <george.dunlap@citrix.com>; 'Ian
> >> Jackson' <ian.jackson@eu.citrix.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Stefano Stabellini'
> >> <sstabellini@kernel.org>; 'Wei Liu' <wl@xen.org>
> >> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
> >>
> >> Hi Paul,
> >>
> >> On 03/04/2020 16:55, Paul Durrant wrote:
> >>>> -----Original Message-----
> >>> [snip]
> >>>>> +
> >>>>> +#include <xen/save.h>
> >>>>> +
> >>>>> +struct domain_context {
> >>>>> +    bool log;
> >>>>> +    struct domain_save_descriptor desc;
> >>>>> +    domain_copy_entry copy;
> >>>>
> >>>> As your new framework is technically an extension of existing one, it
> >>>> would be good to explain why we diverge in the definitions.
> >>>>
> >>>
> >>> I don't follow. What is diverging? I explain in the commit comment that this is a parallel
> >> framework. Do I need to justify why it is not a carbon copy of the HVM one?
> >>
> >> Well, they are both restoring/saving guest state. The only difference is
> >> the existing one is focusing on HVM state.
> >>
> >> So it would make sense long term to have only one hypercall and tell
> >> what you want to save. In fact, some of the improvement here would
> >> definitely make the HVM one nicer to use (at least in the context of LU).
> >>
> >
> > I guess we could move the HVM save records over to the new framework, but it works for the moment so
> I don't want to bring it into scope now.
> 
> And I agree, hence why I say "long term" :).
> 
> >
> >>   From the commit message, it is not clear to me why a new framework and
> >> why the infrastructure is at the same time different but not.
> >>
> >
> > An alternative would be to move the HVM save code into common code and then try to adapt it. I think
> that would result in more code churn and ultimately be harder to review. The extra infrastructure
> introduced here is fairly minimal and, for the moment, only targeting PV state. As I said above
> there's nothing stopping the HVM records being ported over later once any initial issues have been
> shaken out.
> 
> Code churn is always going to be necessary one day or another if we want
> to consolidate the two.
> 
> Having two frameworks is not without risks. There are a few unknowns to
> be answered:
>    * Is there any dependency between the two? If yes, what is the ordering?

There isn't any dependency at the moment so need I say anything? If an ordering dependency is introduced by a future patch then surely that would be time to call it out.

>    * The name of the hypercall does not say anything about "PV". So a
> contributor could think it would be fine to save/restore new HVM state
> in the new generic hypercall. Is it going to be an issue? If so, how do
> we prevent it?

The commit message says:

"Domain context is state held in the hypervisor that does not come under
the category of 'HVM state' but is instead 'PV state' that is common
between PV guests and enlightened HVM guests (i.e. those that have PV
drivers) such as event channel state, grant entry state, etc."

Do you think this should also appear in a comment? Do we really care? Nothing fundamentally prevents the mechanism being used for HVM state, but that may introduce an ordering dependency.

>    * Are we going to deprecate the existing framework?
> 

There is no intention as yet.

> I am not suggesting we should not go with two frameworks, but the
> reasons and implications are not clear to me. Hence, why I think the
> commit message should be expanded with some rationale.
> 

Ok, I can add a paragraph to try to explain a little more.

  Paul
Julien Grall April 6, 2020, 9:50 a.m. UTC | #15
Hi Paul,

On 06/04/2020 10:18, Paul Durrant wrote:
>> -----Original Message-----
>> From: Julien Grall <julien@xen.org>
>> Sent: 06 April 2020 10:08
>> To: paul@xen.org; xen-devel@lists.xenproject.org
>> Cc: 'Andrew Cooper' <andrew.cooper3@citrix.com>; 'George Dunlap' <george.dunlap@citrix.com>; 'Ian
>> Jackson' <ian.jackson@eu.citrix.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Stefano Stabellini'
>> <sstabellini@kernel.org>; 'Wei Liu' <wl@xen.org>
>> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
>>
>> Hi Paul,
>>
>> On 06/04/2020 09:27, Paul Durrant wrote:
>>>> -----Original Message-----
>>>> From: Julien Grall <julien@xen.org>
>>>> Sent: 03 April 2020 18:24
>>>> To: paul@xen.org; xen-devel@lists.xenproject.org
>>>> Cc: 'Andrew Cooper' <andrew.cooper3@citrix.com>; 'George Dunlap' <george.dunlap@citrix.com>; 'Ian
>>>> Jackson' <ian.jackson@eu.citrix.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Stefano Stabellini'
>>>> <sstabellini@kernel.org>; 'Wei Liu' <wl@xen.org>
>>>> Subject: Re: [PATCH 1/5] xen/common: introduce a new framework for save/restore of 'domain' context
>>>>
>>>> Hi Paul,
>>>>
>>>> On 03/04/2020 16:55, Paul Durrant wrote:
>>>>>> -----Original Message-----
>>>>> [snip]
>>>>>>> +
>>>>>>> +#include <xen/save.h>
>>>>>>> +
>>>>>>> +struct domain_context {
>>>>>>> +    bool log;
>>>>>>> +    struct domain_save_descriptor desc;
>>>>>>> +    domain_copy_entry copy;
>>>>>>
>>>>>> As your new framework is technically an extension of existing one, it
>>>>>> would be good to explain why we diverge in the definitions.
>>>>>>
>>>>>
>>>>> I don't follow. What is diverging? I explain in the commit comment that this is a parallel
>>>> framework. Do I need to justify why it is not a carbon copy of the HVM one?
>>>>
>>>> Well, they are both restoring/saving guest state. The only difference is
>>>> the existing one is focusing on HVM state.
>>>>
>>>> So it would make sense long term to have only one hypercall and tell
>>>> what you want to save. In fact, some of the improvement here would
>>>> definitely make the HVM one nicer to use (at least in the context of LU).
>>>>
>>>
>>> I guess we could move the HVM save records over to the new framework, but it works for the moment so
>> I don't want to bring it into scope now.
>>
>> And I agree, hence why I say "long term" :).
>>
>>>
>>>>    From the commit message, it is not clear to me why a new framework and
>>>> why the infrastructure is at the same time different but not.
>>>>
>>>
>>> An alternative would be to move the HVM save code into common code and then try to adapt it. I think
>> that would result in more code churn and ultimately be harder to review. The extra infrastructure
>> introduced here is fairly minimal and, for the moment, only targeting PV state. As I said above
>> there's nothing stopping the HVM records being ported over later once any initial issues have been
>> shaken out.
>>
>> Code churn is always going to be necessary one day or another if we want
>> to consolidate the two.
>>
>> Having two frameworks is not without risks. There are a few unknowns to
>> be answered:
>>     * Is there any dependency between the two? If yes, what is the ordering?
> 
> There isn't any dependency at the moment so need I say anything? If an ordering dependency is introduced by a future patch then surely that would be time to call it out.

If we don't spell out a dependency now, then the risk is we have to 
modify the toolstack at the same time as spelling out the dependency.

I am not sure whether this matters though. It would only affect the 
save part, as the restore part should just be reading records one by one.

> 
>>     * The name of the hypercall does not say anything about "PV". So a
>> contributor could think it would be fine to save/restore new HVM state
>> in the new generic hypercall. Is it going to be an issue? If so, how do
>> we prevent it?
> 
> The commit message says:
> 
> "Domain context is state held in the hypervisor that does not come under
> the category of 'HVM state' but is instead 'PV state' that is common
> between PV guests and enlightened HVM guests (i.e. those that have PV
> drivers) such as event channel state, grant entry state, etc."

This does not seem to cover all the cases. For instance, where would you 
save IOREQ server information?

> 
> Do you think this should also appear in a comment? Do we really care? Nothing fundamentally prevents the mechanism being used for HVM state, but that may introduce an ordering dependency.

I don't particularly like the idea of letting the contributor decide 
whether state belongs in the "HVM context" or in the "Domain context".

This is going to result in unwanted dependencies and possibly 
bikeshedding over where things should be saved.

My preference would be to mark the existing framework as deprecated and 
require all new state to be saved as part of the new "Domain Context".

> 
>>     * Are we going to deprecate the existing framework?
>>
> 
> There is no intention as yet.
> 
>> I am not suggesting we should not go with two frameworks, but the
>> reasons and implications are not clear to me. Hence, why I think the
>> commit message should be expanded with some rationale.
>>
> 
> Ok, I can add a paragraph to try to explain a little more.

That would be appreciated. Thank you!

Cheers,
Paul Durrant April 6, 2020, 10:34 a.m. UTC | #16
> -----Original Message-----
[snip]
> >>     * The name of the hypercall does not say anything about "PV". So a
> >> contributor could think it would be fine to save/restore new HVM state
> >> in the new generic hypercall. Is it going to be an issue? If so, how do
> >> we prevent it?
> >
> > The commit message says:
> >
> > "Domain context is state held in the hypervisor that does not come under
> > the category of 'HVM state' but is instead 'PV state' that is common
> > between PV guests and enlightened HVM guests (i.e. those that have PV
> > drivers) such as event channel state, grant entry state, etc."
> 
> This does not seem to cover all the cases. For instance, where would you
> save IOREQ servers information?
> 

Ok, I agree that is ambiguous. I'd still call it PV state, but of course it only relates to HVM guests.

> >
> > Do you think this should also appear in a comment? Do we really care? Nothing fundamentally prevents
> the mechanism being used for HVM state, but that may introduce an ordering dependency.
> 
> I don't particularly like the idea to let the contributor decide where
> "HVM context" or as part of the "Domain context".
> 
> This is  going to result to unwanted dependency and possibly
> bikeshedding on where they should be saved.
> 
> My preference would be to mark the existing framework as deprecated and
> force all the new states to be saved as part of the new "Domain Context".
> 

I'm ok with that. Does anyone else have an opinion to the contrary?

> >
> >>     * Are we going to deprecate the existing framework?
> >>
> >
> > There is no intention as yet.
> >
> >> I am not suggesting we should not go with two frameworks, but the
> >> reasons and implications are not clear to me. Hence, why I think the
> >> commit message should be expanded with some rationale.
> >>
> >
> > Ok, I can add a paragraph to try to explain a little more.
> 
> That would be appreciated. Thank you!
> 

I'll mention the deprecation of the HVM context interface there too.

  Paul
Jan Beulich April 6, 2020, 12:43 p.m. UTC | #17
On 06.04.2020 12:34, Paul Durrant wrote:
>>> Do you think this should also appear in a comment? Do we really care? Nothing fundamentally prevents
>> the mechanism being used for HVM state, but that may introduce an ordering dependency.
>>
>> I don't particularly like the idea to let the contributor decide where
>> "HVM context" or as part of the "Domain context".
>>
>> This is  going to result to unwanted dependency and possibly
>> bikeshedding on where they should be saved.
>>
>> My preference would be to mark the existing framework as deprecated and
>> force all the new states to be saved as part of the new "Domain Context".
> 
> I'm ok with that. Any others have any opinion to the contrary?

Well, not in principle, but I won't have a firm opinion until I've
actually looked at (relevant parts of) the patches.

Jan
Patch

diff --git a/xen/common/Makefile b/xen/common/Makefile
index e8cde65370..90553ba5d7 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -37,6 +37,7 @@  obj-y += radix-tree.o
 obj-y += rbtree.o
 obj-y += rcupdate.o
 obj-y += rwlock.o
+obj-y += save.o
 obj-y += shutdown.o
 obj-y += softirq.o
 obj-y += sort.o
diff --git a/xen/common/save.c b/xen/common/save.c
new file mode 100644
index 0000000000..bef99452d8
--- /dev/null
+++ b/xen/common/save.c
@@ -0,0 +1,262 @@ 
+/*
+ * save.c: Save and restore PV guest state common to all domain types.
+ *
+ * Copyright Amazon.com Inc. or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/save.h>
+
+struct domain_context {
+    bool log;
+    struct domain_save_descriptor desc;
+    domain_copy_entry copy;
+    void *priv;
+};
+
+static struct {
+    const char *name;
+    bool per_vcpu;
+    domain_save_handler save;
+    domain_load_handler load;
+} handlers[DOMAIN_SAVE_CODE_MAX + 1];
+
+void __init domain_register_save_type(unsigned int tc, const char *name,
+                                      bool per_vcpu,
+                                      domain_save_handler save,
+                                      domain_load_handler load)
+{
+    BUG_ON(tc > ARRAY_SIZE(handlers));
+
+    ASSERT(!handlers[tc].save);
+    ASSERT(!handlers[tc].load);
+
+    handlers[tc].name = name;
+    handlers[tc].per_vcpu = per_vcpu;
+    handlers[tc].save = save;
+    handlers[tc].load = load;
+}
+
+int domain_save_entry(struct domain_context *c, unsigned int tc,
+                      const char *name, const struct vcpu *v, void *src,
+                      size_t src_len)
+{
+    int rc;
+
+    if ( c->log && tc != DOMAIN_SAVE_CODE(HEADER) &&
+         tc != DOMAIN_SAVE_CODE(END) )
+        gdprintk(XENLOG_INFO, "%pv save: %s (%lu)\n", v, name, src_len);
+
+    if ( !IS_ALIGNED(src_len, 8) )
+        return -EINVAL;
+
+    BUG_ON(tc != c->desc.typecode);
+    BUG_ON(v->vcpu_id != c->desc.instance);
+    c->desc.length = src_len;
+
+    rc = c->copy(c->priv, &c->desc, sizeof(c->desc));
+    if ( rc )
+        return rc;
+
+    return c->copy(c->priv, src, src_len);
+}
+
+int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
+                unsigned long mask, bool dry_run)
+{
+    struct domain_context c = {
+        .copy = copy,
+        .priv = priv,
+        .log = !dry_run,
+    };
+    struct domain_save_header h = {
+        .magic = DOMAIN_SAVE_MAGIC,
+        .version = DOMAIN_SAVE_VERSION,
+    };
+    struct domain_save_header e;
+    unsigned int i;
+    int rc;
+
+    ASSERT(d != current->domain);
+
+    if ( d->is_dying )
+        return -EINVAL;
+
+    domain_pause(d);
+
+    c.desc.typecode = DOMAIN_SAVE_CODE(HEADER);
+
+    rc = DOMAIN_SAVE_ENTRY(HEADER, &c, d->vcpu[0], &h, sizeof(h));
+    if ( rc )
+        goto out;
+
+    for ( i = 0; i < ARRAY_SIZE(handlers); i++ )
+    {
+        domain_save_handler save = handlers[i].save;
+
+        if ( (mask && !test_bit(i, &mask)) || !save )
+            continue;
+
+        memset(&c.desc, 0, sizeof(c.desc));
+        c.desc.typecode = i;
+
+        if ( handlers[i].per_vcpu )
+        {
+            struct vcpu *v;
+
+            for_each_vcpu ( d, v )
+            {
+                c.desc.instance = v->vcpu_id;
+
+                rc = save(v, &c, dry_run);
+                if ( rc )
+                    goto out;
+            }
+        }
+        else
+        {
+            rc = save(d->vcpu[0], &c, dry_run);
+            if ( rc )
+                goto out;
+        }
+    }
+
+    memset(&c.desc, 0, sizeof(c.desc));
+    c.desc.typecode = DOMAIN_SAVE_CODE(END);
+
+    rc = DOMAIN_SAVE_ENTRY(END, &c, d->vcpu[0], &e, 0);
+
+ out:
+    domain_unpause(d);
+
+    return rc;
+}
+
+int domain_load_entry(struct domain_context *c, unsigned int tc,
+                      const char *name, const struct vcpu *v, void *dst,
+                      size_t dst_len, bool exact)
+{
+    int rc;
+
+    if ( c->log && tc != DOMAIN_SAVE_CODE(HEADER) &&
+         tc != DOMAIN_SAVE_CODE(END) )
+        gdprintk(XENLOG_INFO, "%pv load: %s (%lu)\n", v, name, dst_len);
+
+    BUG_ON(tc != c->desc.typecode);
+    BUG_ON(v->vcpu_id != c->desc.instance);
+
+    if ( (exact ?
+          (dst_len != c->desc.length) : (dst_len < c->desc.length)) ||
+         !IS_ALIGNED(c->desc.length, 8) )
+        return -EINVAL;
+
+    rc = c->copy(c->priv, dst, c->desc.length);
+    if ( rc )
+        return rc;
+
+    if ( !exact )
+    {
+        dst += c->desc.length;
+        memset(dst, 0, dst_len - c->desc.length);
+    }
+
+    return 0;
+}
+
+int domain_load(struct domain *d,  domain_copy_entry copy, void *priv,
+                unsigned long mask)
+{
+    struct domain_context c = {
+        .copy = copy,
+        .priv = priv,
+        .log = true,
+    };
+    struct domain_save_header h, e;
+    int rc;
+
+    ASSERT(d != current->domain);
+
+    if ( d->is_dying )
+        return -EINVAL;
+
+    rc = c.copy(c.priv, &c.desc, sizeof(c.desc));
+    if ( rc )
+        return rc;
+
+    if ( c.desc.typecode != DOMAIN_SAVE_CODE(HEADER) ||
+         c.desc.instance != 0 )
+        return -EINVAL;
+
+    rc = DOMAIN_LOAD_ENTRY(HEADER, &c, d->vcpu[0], &h, sizeof(h), true);
+    if ( rc )
+        return rc;
+
+    if ( h.magic != DOMAIN_SAVE_MAGIC || h.version != DOMAIN_SAVE_VERSION )
+        return -EINVAL;
+
+    domain_pause(d);
+
+    for (;;)
+    {
+        unsigned int i;
+        domain_load_handler load;
+        struct vcpu *v;
+
+        rc = c.copy(c.priv, &c.desc, sizeof(c.desc));
+        if ( rc )
+            break;
+
+        if ( c.desc.typecode == DOMAIN_SAVE_CODE(END) ) {
+            rc = DOMAIN_LOAD_ENTRY(END, &c, d->vcpu[0], &e, 0, true);
+            break;
+        }
+
+        rc = -EINVAL;
+        if ( c.desc.typecode >= ARRAY_SIZE(handlers) ||
+             c.desc.instance >= d->max_vcpus )
+            break;
+
+        i = c.desc.typecode;
+        load = handlers[i].load;
+        v = d->vcpu[c.desc.instance];
+
+        if ( mask && !test_bit(i, &mask) )
+        {
+            /* Sink the data */
+            rc = c.copy(c.priv, NULL, c.desc.length);
+            if ( rc )
+                break;
+
+            continue;
+        }
+
+        rc = load ? load(v, &c) : -EOPNOTSUPP;
+        if ( rc )
+            break;
+    }
+
+    domain_unpause(d);
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/public/save.h b/xen/include/public/save.h
new file mode 100644
index 0000000000..84981cd0f6
--- /dev/null
+++ b/xen/include/public/save.h
@@ -0,0 +1,74 @@ 
+/*
+ * save.h
+ *
+ * Structure definitions for common PV/HVM domain state that is held by
+ * Xen and must be saved along with the domain's memory.
+ *
+ * Copyright Amazon.com Inc. or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_SAVE_H__
+#define __XEN_PUBLIC_SAVE_H__
+
+#include "xen.h"
+
+/* Each entry is preceded by a descriptor */
+struct domain_save_descriptor {
+    uint16_t typecode;
+    uint16_t instance;
+    /*
+     * Entry length not including this descriptor. Entries must be padded
+     * to a multiple of 8 bytes to make sure descriptors remain correctly
+     * aligned.
+     */
+    uint32_t length;
+};
+
+/*
+ * Each entry has a type associated with it. DECLARE_DOMAIN_SAVE_TYPE
+ * binds these things together.
+ */
+#define DECLARE_DOMAIN_SAVE_TYPE(_x, _code, _type) \
+    struct __DOMAIN_SAVE_TYPE_##_x { _type t; char c[_code]; };
+
+#define DOMAIN_SAVE_TYPE(_x) \
+    typeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->t)
+#define DOMAIN_SAVE_CODE(_x) \
+    (sizeof (((struct __DOMAIN_SAVE_TYPE_##_x *)(0))->c))
+#define DOMAIN_SAVE_MASK(_x) (1u << DOMAIN_SAVE_CODE(_x))
+
+/* Terminating entry */
+struct domain_save_end {};
+DECLARE_DOMAIN_SAVE_TYPE(END, 0, struct domain_save_end);
+
+#define DOMAIN_SAVE_MAGIC   0x53415645
+#define DOMAIN_SAVE_VERSION 0x00000001
+
+/* Initial entry */
+struct domain_save_header {
+    uint32_t magic;             /* Must be DOMAIN_SAVE_MAGIC */
+    uint32_t version;           /* Save format version */
+};
+DECLARE_DOMAIN_SAVE_TYPE(HEADER, 1, struct domain_save_header);
+
+#define DOMAIN_SAVE_CODE_MAX 1
+
+#endif /* __XEN_PUBLIC_SAVE_H__ */
diff --git a/xen/include/xen/save.h b/xen/include/xen/save.h
new file mode 100644
index 0000000000..d5846f9e68
--- /dev/null
+++ b/xen/include/xen/save.h
@@ -0,0 +1,115 @@ 
+/*
+ * save.h: support routines for save/restore
+ *
+ * Copyright Amazon.com Inc. or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __XEN_SAVE_H__
+#define __XEN_SAVE_H__
+
+#include <xen/sched.h>
+#include <xen/types.h>
+#include <xen/init.h>
+
+#include <public/xen.h>
+#include <public/save.h>
+
+struct domain_context;
+
+int domain_save_entry(struct domain_context *c, unsigned int tc,
+                      const char *name, const struct vcpu *v, void *src,
+                      size_t src_len);
+
+#define DOMAIN_SAVE_ENTRY(_x, _c, _v, _src, _len)                        \
+        domain_save_entry((_c), DOMAIN_SAVE_CODE(_x), #_x, (_v), (_src), \
+                          (_len));
+
+int domain_load_entry(struct domain_context *c, unsigned int tc,
+                      const char *name, const struct vcpu *v, void *dest,
+                      size_t dest_len, bool exact);
+
+#define DOMAIN_LOAD_ENTRY(_x, _c, _v, _src, _len, _exact)                \
+        domain_load_entry((_c), DOMAIN_SAVE_CODE(_x), #_x, (_v), (_src), \
+                          (_len), (_exact));
+
+/*
+ * The 'dry_run' flag indicates that the caller of domain_save() (see
+ * below) is not trying to actually acquire the data, only the size
+ * of the data. The save handler can therefore limit work to only that
+ * which is necessary to call DOMAIN_SAVE_ENTRY() with an accurate value
+ * for '_len'.
+ */
+typedef int (*domain_save_handler)(const struct vcpu *v,
+                                   struct domain_context *h,
+                                   bool dry_run);
+typedef int (*domain_load_handler)(struct vcpu *v,
+                                   struct domain_context *h);
+
+void domain_register_save_type(unsigned int tc, const char *name,
+                               bool per_vcpu,
+                               domain_save_handler save,
+                               domain_load_handler load);
+
+#define DOMAIN_REGISTER_SAVE_RESTORE(_x, _per_vcpu, _save, _load) \
+static int __init __domain_register_##_x##_save_restore(void)     \
+{                                                                 \
+    domain_register_save_type(                                    \
+        DOMAIN_SAVE_CODE(_x),                                     \
+        #_x,                                                      \
+        (_per_vcpu),                                              \
+        &(_save),                                                 \
+        &(_load));                                                \
+                                                                  \
+    return 0;                                                     \
+}                                                                 \
+__initcall(__domain_register_##_x##_save_restore);
+
+/* Copy callback function */
+typedef int (*domain_copy_entry)(void *priv, void *data, size_t len);
+
+/*
+ * Entry points:
+ *
+ * int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
+ *                 unsigned long mask, bool dry_run);
+ * int domain_load(struct domain *d, domain_copy_entry copy, void *priv,
+ *                 unsigned long mask);
+ *
+ * copy:    This is a callback function provided by the caller that will be
+ *          used to write to (in the save case) or read from (in the load
+ *          case) the context buffer.
+ * priv:    This is a pointer that will be passed to the copy function to
+ *          allow it to identify the context buffer and the current state
+ *          of the save or load operation.
+ * mask:    This is a mask to determine which save record types should be
+ *          copied to or from the buffer.
+ *          If it is zero then all save record types will be copied.
+ *          If it is non-zero then only record types with codes
+ *          corresponding to set bits will be copied. I.e. to copy save
+ *          record 'type', set the bit in position DOMAIN_SAVE_CODE(type).
+ *          DOMAIN_SAVE_CODE(HEADER) and DOMAIN_SAVE_CODE(END) records must
+ *          always be present and thus will be copied regardless of whether
+ *          the bits in those positions are set or not.
+ * dry_run: See the comment concerning (*domain_save) above.
+ *
+ * NOTE: A convenience macro, DOMAIN_SAVE_MASK(type), is defined to support
+ *       setting bits in the mask field.
+ */
+int domain_save(struct domain *d, domain_copy_entry copy, void *priv,
+                unsigned long mask, bool dry_run);
+int domain_load(struct domain *d, domain_copy_entry copy, void *priv,
+                unsigned long mask);
+
+#endif /* __XEN_SAVE_H__ */
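
For illustration, a hypothetical per-vCPU record type FOO might hook 
into the framework as follows (a sketch only, not part of the patch; 
adding it would also require bumping DOMAIN_SAVE_CODE_MAX to 2):

    struct domain_save_foo {
        uint64_t bar;
    };
    DECLARE_DOMAIN_SAVE_TYPE(FOO, 2, struct domain_save_foo);

    static int foo_save(const struct vcpu *v, struct domain_context *c,
                        bool dry_run)
    {
        /* fixed-size record: nothing extra to do for a dry run */
        struct domain_save_foo foo = { .bar = 0 /* gather vCPU state */ };

        return DOMAIN_SAVE_ENTRY(FOO, c, v, &foo, sizeof(foo));
    }

    static int foo_load(struct vcpu *v, struct domain_context *c)
    {
        struct domain_save_foo foo;
        int rc = DOMAIN_LOAD_ENTRY(FOO, c, v, &foo, sizeof(foo), true);

        if ( rc )
            return rc;

        /* apply foo.bar to v here */
        return 0;
    }

    DOMAIN_REGISTER_SAVE_RESTORE(FOO, true, foo_save, foo_load);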