diff mbox series

[v4,2/6] introduce UFFD-WP low-level interface helpers

Message ID 20201126151734.743849-3-andrey.gruzdev@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series UFFD write-tracking migration/snapshots | expand

Commit Message

Andrey Gruzdev Nov. 26, 2020, 3:17 p.m. UTC
Implemented support for the whole RAM block memory
protection/un-protection. Introduced higher level
ram_write_tracking_start() and ram_write_tracking_stop()
to start/stop tracking guest memory writes.

Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
---
 include/exec/memory.h      |   7 ++
 include/qemu/userfaultfd.h |  29 +++++
 migration/ram.c            | 120 +++++++++++++++++++++
 migration/ram.h            |   4 +
 util/meson.build           |   1 +
 util/userfaultfd.c         | 215 +++++++++++++++++++++++++++++++++++++
 6 files changed, 376 insertions(+)
 create mode 100644 include/qemu/userfaultfd.h
 create mode 100644 util/userfaultfd.c

Comments

Peter Xu Nov. 27, 2020, 9:04 p.m. UTC | #1
On Thu, Nov 26, 2020 at 06:17:30PM +0300, Andrey Gruzdev wrote:
> Implemented support for the whole RAM block memory
> protection/un-protection. Introduced higher level
> ram_write_tracking_start() and ram_write_tracking_stop()
> to start/stop tracking guest memory writes.

The whole patch looks good to me in general.  A few nitpickings below..

> 
> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
> ---
>  include/exec/memory.h      |   7 ++
>  include/qemu/userfaultfd.h |  29 +++++
>  migration/ram.c            | 120 +++++++++++++++++++++
>  migration/ram.h            |   4 +
>  util/meson.build           |   1 +
>  util/userfaultfd.c         | 215 +++++++++++++++++++++++++++++++++++++
>  6 files changed, 376 insertions(+)
>  create mode 100644 include/qemu/userfaultfd.h
>  create mode 100644 util/userfaultfd.c
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 0f3e6bcd5e..3d798fce16 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -139,6 +139,13 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>  /* RAM is a persistent kind memory */
>  #define RAM_PMEM (1 << 5)
>  
> +/*
> + * UFFDIO_WRITEPROTECT is used on this RAMBlock to
> + * support 'write-tracking' migration type.
> + * Implies ram_state->ram_wt_enabled.
> + */
> +#define RAM_UF_WRITEPROTECT (1 << 6)
> +
>  static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>                                         IOMMUNotifierFlag flags,
>                                         hwaddr start, hwaddr end,
> diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h
> new file mode 100644
> index 0000000000..fb843c76db
> --- /dev/null
> +++ b/include/qemu/userfaultfd.h
> @@ -0,0 +1,29 @@
> +/*
> + * Linux UFFD-WP support
> + *
> + * Copyright Virtuozzo GmbH, 2020
> + *
> + * Authors:
> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef USERFAULTFD_H
> +#define USERFAULTFD_H
> +
> +#include "qemu/osdep.h"
> +#include "exec/hwaddr.h"
> +#include <linux/userfaultfd.h>
> +
> +int uffd_create_fd(void);
> +void uffd_close_fd(int uffd);
> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
> +        bool track_missing, bool track_wp);
> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length);
> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp);
> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count);
> +bool uffd_poll_events(int uffd, int tmo);
> +
> +#endif /* USERFAULTFD_H */
> diff --git a/migration/ram.c b/migration/ram.c
> index 7811cde643..3adfd1948d 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -56,6 +56,11 @@
>  #include "savevm.h"
>  #include "qemu/iov.h"
>  #include "multifd.h"
> +#include "sysemu/runstate.h"
> +
> +#ifdef CONFIG_LINUX
> +#include "qemu/userfaultfd.h"
> +#endif
>  
>  /***********************************************************/
>  /* ram save/restore */
> @@ -298,6 +303,8 @@ struct RAMSrcPageRequest {
>  struct RAMState {
>      /* QEMUFile used for this migration */
>      QEMUFile *f;
> +    /* UFFD file descriptor, used in 'write-tracking' migration */
> +    int uffdio_fd;
>      /* Last block that we have visited searching for dirty pages */
>      RAMBlock *last_seen_block;
>      /* Last block from where we have sent data */
> @@ -3788,6 +3795,119 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
>      return 0;
>  }
>  
> +/*
> + * ram_write_tracking_start: start UFFD-WP memory tracking
> + *
> + * Returns 0 for success or negative value in case of error
> + *

(extra new line)

> + */
> +int ram_write_tracking_start(void)
> +{
> +#ifdef CONFIG_LINUX
> +    int uffd;
> +    RAMState *rs = ram_state;
> +    RAMBlock *bs;
> +
> +    /* Open UFFD file descriptor */
> +    uffd = uffd_create_fd();
> +    if (uffd < 0) {
> +        return uffd;
> +    }
> +    rs->uffdio_fd = uffd;

May need a rcu_read_lock() here to guarantee safe access to
RAMBLOCK_FOREACH_NOT_IGNORED.

> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
> +        /* Nothing to do with read-only and MMIO-writable regions */
> +        if (bs->mr->readonly || bs->mr->rom_device) {
> +            continue;
> +        }
> +
> +        bs->flags |= RAM_UF_WRITEPROTECT;
> +        /* Register block memory with UFFD to track writes */
> +        if (uffd_register_memory(rs->uffdio_fd, (hwaddr) bs->host,
> +                bs->max_length, false, true)) {
> +            goto fail;
> +        }
> +        /* Apply UFFD write protection to the block memory range */
> +        if (uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host,
> +                bs->max_length, true)) {
> +            goto fail;
> +        }
> +
> +        info_report("UFFD-WP write-tracking enabled: "
> +                "block_id=%s page_size=%zu start=%p length=%lu "
> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
> +                bs->mr->nonvolatile, bs->mr->rom_device);

Is info_report() by default printed?  Change this into a tracepoint?  Please
refer to functions named with trace_*().

> +    }
> +
> +    return 0;
> +
> +fail:
> +    error_report("ram_write_tracking_start() failed: restoring initial memory state");
> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
> +            continue;
> +        }
> +        /*
> +         * In case some memory block failed to be write-protected
> +         * remove protection and unregister all succeeded RAM blocks
> +         */
> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
> +        /* Cleanup flags */
> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
> +    }
> +
> +    uffd_close_fd(uffd);
> +    rs->uffdio_fd = -1;
> +    return -1;
> +#else
> +    rs->uffdio_fd = -1;
> +    error_setg(&migrate_get_current()->error,
> +            "Background-snapshot not supported on non-Linux hosts");

Accessing the global var seems an overkill to me.  I'd simply return an error,
since iiuc if we gate the capability bit well, then these paths won't really
trigger at all.  So it's just to pass the compilations.

> +    return -1;
> +#endif /* CONFIG_LINUX */
> +}
> +
> +/**
> + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
> + */
> +void ram_write_tracking_stop(void)
> +{
> +#ifdef CONFIG_LINUX
> +    RAMState *rs = ram_state;
> +    RAMBlock *bs;
> +    assert(rs->uffdio_fd >= 0);

Maybe too harsh - we can return if it's invalid.

Meanwhile, better rcu_read_lock(), as well?

> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
> +            continue;
> +        }
> +        /* Remove protection and unregister all affected RAM blocks */
> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
> +        /* Cleanup flags */
> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
> +
> +        info_report("UFFD-WP write-tracking disabled: "
> +                "block_id=%s page_size=%zu start=%p length=%lu "
> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
> +                bs->mr->nonvolatile, bs->mr->rom_device);

Use tracepoint?

> +    }
> +
> +    /* Finally close UFFD file descriptor */
> +    uffd_close_fd(rs->uffdio_fd);
> +    rs->uffdio_fd = -1;
> +#else
> +    error_setg(&migrate_get_current()->error,
> +            "Background-snapshot not supported on non-Linux hosts");

Ditto.

> +#endif /* CONFIG_LINUX */
> +}
> +
>  static SaveVMHandlers savevm_ram_handlers = {
>      .save_setup = ram_save_setup,
>      .save_live_iterate = ram_save_iterate,
> diff --git a/migration/ram.h b/migration/ram.h
> index 011e85414e..0ec63e27ee 100644
> --- a/migration/ram.h
> +++ b/migration/ram.h
> @@ -79,4 +79,8 @@ void colo_flush_ram_cache(void);
>  void colo_release_ram_cache(void);
>  void colo_incoming_start_dirty_log(void);
>  
> +/* Background snapshots */
> +int ram_write_tracking_start(void);
> +void ram_write_tracking_stop(void);
> +
>  #endif
> diff --git a/util/meson.build b/util/meson.build
> index f359af0d46..c64bfe94b3 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -50,6 +50,7 @@ endif
>  
>  if have_system
>    util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
> +  util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
>  endif
>  
>  if have_block
> diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> new file mode 100644
> index 0000000000..038953d7ed
> --- /dev/null
> +++ b/util/userfaultfd.c
> @@ -0,0 +1,215 @@
> +/*
> + * Linux UFFD-WP support
> + *
> + * Copyright Virtuozzo GmbH, 2020
> + *
> + * Authors:
> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/bitops.h"
> +#include "qemu/error-report.h"
> +#include "qemu/userfaultfd.h"
> +#include <poll.h>
> +#include <sys/syscall.h>
> +#include <sys/ioctl.h>
> +
> +/**
> + * uffd_create_fd: create UFFD file descriptor
> + *
> + * Returns non-negative file descriptor or negative value in case of an error
> + */
> +int uffd_create_fd(void)
> +{
> +    int uffd;
> +    struct uffdio_api api_struct;
> +    uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
> +
> +    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
> +    if (uffd < 0) {
> +        error_report("uffd_create_fd() failed: UFFD not supported");
> +        return -1;
> +    }
> +
> +    api_struct.api = UFFD_API;
> +    api_struct.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;

This might be too strict if we want to reuse the codes with postcopy to run on
old kernels that only supports missing.  Not a big problem for now; we can work
on top.

> +    if (ioctl(uffd, UFFDIO_API, &api_struct)) {
> +        error_report("uffd_create_fd() failed: "
> +                     "API version not supported version=%llx errno=%i",
> +                api_struct.api, errno);

(A few strange alignments... won't complain but fixing would be even better)

> +        goto fail;
> +    }
> +
> +    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
> +        error_report("uffd_create_fd() failed: "
> +                     "PAGEFAULT_FLAG_WP feature missing");
> +        goto fail;
> +    }
> +
> +    return uffd;
> +
> +fail:
> +    close(uffd);
> +    return -1;
> +}
> +
> +/**
> + * uffd_close_fd: close UFFD file descriptor
> + *
> + * @uffd: UFFD file descriptor
> + */
> +void uffd_close_fd(int uffd)
> +{
> +    assert(uffd >= 0);
> +    close(uffd);
> +}
> +
> +/**
> + * uffd_register_memory: register memory range with UFFD
> + *
> + * Returns 0 in case of success, negative value on error
> + *
> + * @uffd: UFFD file descriptor
> + * @start: starting virtual address of memory range
> + * @length: length of memory range
> + * @track_missing: generate events on missing-page faults
> + * @track_wp: generate events on write-protected-page faults
> + */
> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,

My understanding is that hwaddr is for guest physical address space.  Here
I think either uint64_t or "void *" may suite better.

> +        bool track_missing, bool track_wp)
> +{
> +    struct uffdio_register uffd_register;
> +
> +    uffd_register.range.start = start;
> +    uffd_register.range.len = length;
> +    uffd_register.mode = (track_missing ? UFFDIO_REGISTER_MODE_MISSING : 0) |
> +                         (track_wp ? UFFDIO_REGISTER_MODE_WP : 0);
> +
> +    if (ioctl(uffd, UFFDIO_REGISTER, &uffd_register)) {
> +        error_report("uffd_register_memory() failed: "
> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
> +                start, length, uffd_register.mode, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * uffd_unregister_memory: un-register memory range with UFFD
> + *
> + * Returns 0 in case of success, negative value on error
> + *
> + * @uffd: UFFD file descriptor
> + * @start: starting virtual address of memory range
> + * @length: length of memory range
> + */
> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length)

Same question on using hwaddr as above.

> +{
> +    struct uffdio_range uffd_range;
> +
> +    uffd_range.start = start;
> +    uffd_range.len = length;
> +
> +    if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
> +        error_report("uffd_unregister_memory() failed: "
> +                     "start=%0"PRIx64" len=%"PRIu64" errno=%i",
> +                start, length, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * uffd_protect_memory: protect/unprotect memory range for writes with UFFD

It may look strange if uffd_protect_memory() can also unprotect stuff...

Maybe rename it to uffd_change_protection()?

> + *
> + * Returns 0 on success or negative value in case of error
> + *
> + * @uffd: UFFD file descriptor
> + * @start: starting virtual address of memory range
> + * @length: length of memory range
> + * @wp: write-protect/unprotect
> + */
> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp)
> +{
> +    struct uffdio_writeprotect uffd_writeprotect;
> +    int res;
> +
> +    uffd_writeprotect.range.start = start;
> +    uffd_writeprotect.range.len = length;
> +    uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
> +
> +    do {
> +        res = ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect);
> +    } while (res < 0 && errno == EINTR);
> +    if (res < 0) {
> +        error_report("uffd_protect_memory() failed: "
> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
> +                start, length, uffd_writeprotect.mode, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * uffd_read_events: read pending UFFD events
> + *
> + * Returns number of fetched messages, 0 if non is available or
> + * negative value in case of an error
> + *
> + * @uffd: UFFD file descriptor
> + * @msgs: pointer to message buffer
> + * @count: number of messages that can fit in the buffer
> + */
> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count)
> +{
> +    ssize_t res;
> +    do {
> +        res = read(uffd, msgs, count * sizeof(struct uffd_msg));
> +    } while (res < 0 && errno == EINTR);
> +
> +    if ((res < 0 && errno == EAGAIN)) {
> +        return 0;
> +    }
> +    if (res < 0) {
> +        error_report("uffd_read_events() failed: errno=%i", errno);
> +        return -1;
> +    }
> +
> +    return (int) (res / sizeof(struct uffd_msg));
> +}
> +
> +/**
> + * uffd_poll_events: poll UFFD file descriptor for read
> + *
> + * Returns true if events are available for read, false otherwise
> + *
> + * @uffd: UFFD file descriptor
> + * @tmo: timeout in milliseconds, 0 for non-blocking operation,
> + *       negative value for infinite wait
> + */
> +bool uffd_poll_events(int uffd, int tmo)

Shall we spell "tmo" out?

Thanks,

> +{
> +    int res;
> +    struct pollfd poll_fd = { .fd = uffd, .events = POLLIN, .revents = 0 };
> +
> +    do {
> +        res = poll(&poll_fd, 1, tmo);
> +    } while (res < 0 && errno == EINTR);
> +
> +    if (res == 0) {
> +        return false;
> +    }
> +    if (res < 0) {
> +        error_report("uffd_poll_events() failed: errno=%i", errno);
> +        return false;
> +    }
> +
> +    return (poll_fd.revents & POLLIN) != 0;
> +}
> -- 
> 2.25.1
>
Andrey Gruzdev Nov. 29, 2020, 8:12 p.m. UTC | #2
On 28.11.2020 00:04, Peter Xu wrote:
> On Thu, Nov 26, 2020 at 06:17:30PM +0300, Andrey Gruzdev wrote:
>> Implemented support for the whole RAM block memory
>> protection/un-protection. Introduced higher level
>> ram_write_tracking_start() and ram_write_tracking_stop()
>> to start/stop tracking guest memory writes.
> 
> The whole patch looks good to me in general.  A few nitpickings below..
> 
>>
>> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
>> ---
>>   include/exec/memory.h      |   7 ++
>>   include/qemu/userfaultfd.h |  29 +++++
>>   migration/ram.c            | 120 +++++++++++++++++++++
>>   migration/ram.h            |   4 +
>>   util/meson.build           |   1 +
>>   util/userfaultfd.c         | 215 +++++++++++++++++++++++++++++++++++++
>>   6 files changed, 376 insertions(+)
>>   create mode 100644 include/qemu/userfaultfd.h
>>   create mode 100644 util/userfaultfd.c
>>
>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>> index 0f3e6bcd5e..3d798fce16 100644
>> --- a/include/exec/memory.h
>> +++ b/include/exec/memory.h
>> @@ -139,6 +139,13 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>>   /* RAM is a persistent kind memory */
>>   #define RAM_PMEM (1 << 5)
>>   
>> +/*
>> + * UFFDIO_WRITEPROTECT is used on this RAMBlock to
>> + * support 'write-tracking' migration type.
>> + * Implies ram_state->ram_wt_enabled.
>> + */
>> +#define RAM_UF_WRITEPROTECT (1 << 6)
>> +
>>   static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>>                                          IOMMUNotifierFlag flags,
>>                                          hwaddr start, hwaddr end,
>> diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h
>> new file mode 100644
>> index 0000000000..fb843c76db
>> --- /dev/null
>> +++ b/include/qemu/userfaultfd.h
>> @@ -0,0 +1,29 @@
>> +/*
>> + * Linux UFFD-WP support
>> + *
>> + * Copyright Virtuozzo GmbH, 2020
>> + *
>> + * Authors:
>> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>> + * later.  See the COPYING file in the top-level directory.
>> + */
>> +
>> +#ifndef USERFAULTFD_H
>> +#define USERFAULTFD_H
>> +
>> +#include "qemu/osdep.h"
>> +#include "exec/hwaddr.h"
>> +#include <linux/userfaultfd.h>
>> +
>> +int uffd_create_fd(void);
>> +void uffd_close_fd(int uffd);
>> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
>> +        bool track_missing, bool track_wp);
>> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length);
>> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp);
>> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count);
>> +bool uffd_poll_events(int uffd, int tmo);
>> +
>> +#endif /* USERFAULTFD_H */
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 7811cde643..3adfd1948d 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -56,6 +56,11 @@
>>   #include "savevm.h"
>>   #include "qemu/iov.h"
>>   #include "multifd.h"
>> +#include "sysemu/runstate.h"
>> +
>> +#ifdef CONFIG_LINUX
>> +#include "qemu/userfaultfd.h"
>> +#endif
>>   
>>   /***********************************************************/
>>   /* ram save/restore */
>> @@ -298,6 +303,8 @@ struct RAMSrcPageRequest {
>>   struct RAMState {
>>       /* QEMUFile used for this migration */
>>       QEMUFile *f;
>> +    /* UFFD file descriptor, used in 'write-tracking' migration */
>> +    int uffdio_fd;
>>       /* Last block that we have visited searching for dirty pages */
>>       RAMBlock *last_seen_block;
>>       /* Last block from where we have sent data */
>> @@ -3788,6 +3795,119 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
>>       return 0;
>>   }
>>   
>> +/*
>> + * ram_write_tracking_start: start UFFD-WP memory tracking
>> + *
>> + * Returns 0 for success or negative value in case of error
>> + *
> 
> (extra new line)
> 

Ok, see.

>> + */
>> +int ram_write_tracking_start(void)
>> +{
>> +#ifdef CONFIG_LINUX
>> +    int uffd;
>> +    RAMState *rs = ram_state;
>> +    RAMBlock *bs;
>> +
>> +    /* Open UFFD file descriptor */
>> +    uffd = uffd_create_fd();
>> +    if (uffd < 0) {
>> +        return uffd;
>> +    }
>> +    rs->uffdio_fd = uffd;
> 
> May need a rcu_read_lock() here to guarantee safe access to
> RAMBLOCK_FOREACH_NOT_IGNORED.
> 

Yeah, really better to add RCU read lock here.

>> +
>> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
>> +        /* Nothing to do with read-only and MMIO-writable regions */
>> +        if (bs->mr->readonly || bs->mr->rom_device) {
>> +            continue;
>> +        }
>> +
>> +        bs->flags |= RAM_UF_WRITEPROTECT;
>> +        /* Register block memory with UFFD to track writes */
>> +        if (uffd_register_memory(rs->uffdio_fd, (hwaddr) bs->host,
>> +                bs->max_length, false, true)) {
>> +            goto fail;
>> +        }
>> +        /* Apply UFFD write protection to the block memory range */
>> +        if (uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host,
>> +                bs->max_length, true)) {
>> +            goto fail;
>> +        }
>> +
>> +        info_report("UFFD-WP write-tracking enabled: "
>> +                "block_id=%s page_size=%zu start=%p length=%lu "
>> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
>> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
>> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
>> +                bs->mr->nonvolatile, bs->mr->rom_device);
> 
> Is info_report() by default printed?  Change this into a tracepoint?  Please
> refer to functions named with trace_*().
> 

Agree, I'll change to tracepoints.

>> +    }
>> +
>> +    return 0;
>> +
>> +fail:
>> +    error_report("ram_write_tracking_start() failed: restoring initial memory state");
>> +
>> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
>> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
>> +            continue;
>> +        }
>> +        /*
>> +         * In case some memory block failed to be write-protected
>> +         * remove protection and unregister all succeeded RAM blocks
>> +         */
>> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
>> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
>> +        /* Cleanup flags */
>> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
>> +    }
>> +
>> +    uffd_close_fd(uffd);
>> +    rs->uffdio_fd = -1;
>> +    return -1;
>> +#else
>> +    rs->uffdio_fd = -1;
>> +    error_setg(&migrate_get_current()->error,
>> +            "Background-snapshot not supported on non-Linux hosts");
> 
> Accessing the global var seems an overkill to me.  I'd simply return an error,
> since iiuc if we gate the capability bit well, then these paths won't really
> trigger at all.  So it's just to pass the compilations.
> 

IMHO it's not too terrible, but really not needed, since won't be ever 
triggered. I'll remove error_setg() call.

>> +    return -1;
>> +#endif /* CONFIG_LINUX */
>> +}
>> +
>> +/**
>> + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
>> + */
>> +void ram_write_tracking_stop(void)
>> +{
>> +#ifdef CONFIG_LINUX
>> +    RAMState *rs = ram_state;
>> +    RAMBlock *bs;
>> +    assert(rs->uffdio_fd >= 0);
> 
> Maybe too harsh - we can return if it's invalid.
> 
> Meanwhile, better rcu_read_lock(), as well?
> 

Yep, RCU lock, I'll add. Why too harsh? Just a debug assertion.
>> +
>> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
>> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
>> +            continue;
>> +        }
>> +        /* Remove protection and unregister all affected RAM blocks */
>> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
>> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
>> +        /* Cleanup flags */
>> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
>> +
>> +        info_report("UFFD-WP write-tracking disabled: "
>> +                "block_id=%s page_size=%zu start=%p length=%lu "
>> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
>> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
>> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
>> +                bs->mr->nonvolatile, bs->mr->rom_device);
> 
> Use tracepoint?
> 

Yep.

>> +    }
>> +
>> +    /* Finally close UFFD file descriptor */
>> +    uffd_close_fd(rs->uffdio_fd);
>> +    rs->uffdio_fd = -1;
>> +#else
>> +    error_setg(&migrate_get_current()->error,
>> +            "Background-snapshot not supported on non-Linux hosts");
> 
> Ditto.
> 

Aha, the same.

>> +#endif /* CONFIG_LINUX */
>> +}
>> +
>>   static SaveVMHandlers savevm_ram_handlers = {
>>       .save_setup = ram_save_setup,
>>       .save_live_iterate = ram_save_iterate,
>> diff --git a/migration/ram.h b/migration/ram.h
>> index 011e85414e..0ec63e27ee 100644
>> --- a/migration/ram.h
>> +++ b/migration/ram.h
>> @@ -79,4 +79,8 @@ void colo_flush_ram_cache(void);
>>   void colo_release_ram_cache(void);
>>   void colo_incoming_start_dirty_log(void);
>>   
>> +/* Background snapshots */
>> +int ram_write_tracking_start(void);
>> +void ram_write_tracking_stop(void);
>> +
>>   #endif
>> diff --git a/util/meson.build b/util/meson.build
>> index f359af0d46..c64bfe94b3 100644
>> --- a/util/meson.build
>> +++ b/util/meson.build
>> @@ -50,6 +50,7 @@ endif
>>   
>>   if have_system
>>     util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
>> +  util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
>>   endif
>>   
>>   if have_block
>> diff --git a/util/userfaultfd.c b/util/userfaultfd.c
>> new file mode 100644
>> index 0000000000..038953d7ed
>> --- /dev/null
>> +++ b/util/userfaultfd.c
>> @@ -0,0 +1,215 @@
>> +/*
>> + * Linux UFFD-WP support
>> + *
>> + * Copyright Virtuozzo GmbH, 2020
>> + *
>> + * Authors:
>> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>> + * later.  See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/bitops.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/userfaultfd.h"
>> +#include <poll.h>
>> +#include <sys/syscall.h>
>> +#include <sys/ioctl.h>
>> +
>> +/**
>> + * uffd_create_fd: create UFFD file descriptor
>> + *
>> + * Returns non-negative file descriptor or negative value in case of an error
>> + */
>> +int uffd_create_fd(void)
>> +{
>> +    int uffd;
>> +    struct uffdio_api api_struct;
>> +    uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
>> +
>> +    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
>> +    if (uffd < 0) {
>> +        error_report("uffd_create_fd() failed: UFFD not supported");
>> +        return -1;
>> +    }
>> +
>> +    api_struct.api = UFFD_API;
>> +    api_struct.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
> 
> This might be too strict if we want to reuse the codes with postcopy to run on
> old kernels that only supports missing.  Not a big problem for now; we can work
> on top.
> 

Aha, ok.

>> +    if (ioctl(uffd, UFFDIO_API, &api_struct)) {
>> +        error_report("uffd_create_fd() failed: "
>> +                     "API version not supported version=%llx errno=%i",
>> +                api_struct.api, errno);
> 
> (A few strange alignments... won't complain but fixing would be even better)
> 

Mm, yes!

>> +        goto fail;
>> +    }
>> +
>> +    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
>> +        error_report("uffd_create_fd() failed: "
>> +                     "PAGEFAULT_FLAG_WP feature missing");
>> +        goto fail;
>> +    }
>> +
>> +    return uffd;
>> +
>> +fail:
>> +    close(uffd);
>> +    return -1;
>> +}
>> +
>> +/**
>> + * uffd_close_fd: close UFFD file descriptor
>> + *
>> + * @uffd: UFFD file descriptor
>> + */
>> +void uffd_close_fd(int uffd)
>> +{
>> +    assert(uffd >= 0);
>> +    close(uffd);
>> +}
>> +
>> +/**
>> + * uffd_register_memory: register memory range with UFFD
>> + *
>> + * Returns 0 in case of success, negative value on error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @start: starting virtual address of memory range
>> + * @length: length of memory range
>> + * @track_missing: generate events on missing-page faults
>> + * @track_wp: generate events on write-protected-page faults
>> + */
>> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
> 
> My understanding is that hwaddr is for guest physical address space.  Here
> I think either uint64_t or "void *" may suite better.
> 

Agree, uint64_t will be better here.

>> +        bool track_missing, bool track_wp)
>> +{
>> +    struct uffdio_register uffd_register;
>> +
>> +    uffd_register.range.start = start;
>> +    uffd_register.range.len = length;
>> +    uffd_register.mode = (track_missing ? UFFDIO_REGISTER_MODE_MISSING : 0) |
>> +                         (track_wp ? UFFDIO_REGISTER_MODE_WP : 0);
>> +
>> +    if (ioctl(uffd, UFFDIO_REGISTER, &uffd_register)) {
>> +        error_report("uffd_register_memory() failed: "
>> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
>> +                start, length, uffd_register.mode, errno);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/**
>> + * uffd_unregister_memory: un-register memory range with UFFD
>> + *
>> + * Returns 0 in case of success, negative value on error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @start: starting virtual address of memory range
>> + * @length: length of memory range
>> + */
>> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length)
> 
> Same question on using hwaddr as above.
> 

Aha.

>> +{
>> +    struct uffdio_range uffd_range;
>> +
>> +    uffd_range.start = start;
>> +    uffd_range.len = length;
>> +
>> +    if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
>> +        error_report("uffd_unregister_memory() failed: "
>> +                     "start=%0"PRIx64" len=%"PRIu64" errno=%i",
>> +                start, length, errno);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/**
>> + * uffd_protect_memory: protect/unprotect memory range for writes with UFFD
> 
> It may look strange if uffd_protect_memory() can also unprotect stuff...
> 
> Maybe rename it to uffd_change_protection()?
> 

Yes, this name is better.

>> + *
>> + * Returns 0 on success or negative value in case of error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @start: starting virtual address of memory range
>> + * @length: length of memory range
>> + * @wp: write-protect/unprotect
>> + */
>> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp)
>> +{
>> +    struct uffdio_writeprotect uffd_writeprotect;
>> +    int res;
>> +
>> +    uffd_writeprotect.range.start = start;
>> +    uffd_writeprotect.range.len = length;
>> +    uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
>> +
>> +    do {
>> +        res = ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect);
>> +    } while (res < 0 && errno == EINTR);
>> +    if (res < 0) {
>> +        error_report("uffd_protect_memory() failed: "
>> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
>> +                start, length, uffd_writeprotect.mode, errno);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/**
>> + * uffd_read_events: read pending UFFD events
>> + *
>> + * Returns number of fetched messages, 0 if non is available or
>> + * negative value in case of an error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @msgs: pointer to message buffer
>> + * @count: number of messages that can fit in the buffer
>> + */
>> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count)
>> +{
>> +    ssize_t res;
>> +    do {
>> +        res = read(uffd, msgs, count * sizeof(struct uffd_msg));
>> +    } while (res < 0 && errno == EINTR);
>> +
>> +    if ((res < 0 && errno == EAGAIN)) {
>> +        return 0;
>> +    }
>> +    if (res < 0) {
>> +        error_report("uffd_read_events() failed: errno=%i", errno);
>> +        return -1;
>> +    }
>> +
>> +    return (int) (res / sizeof(struct uffd_msg));
>> +}
>> +
>> +/**
>> + * uffd_poll_events: poll UFFD file descriptor for read
>> + *
>> + * Returns true if events are available for read, false otherwise
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @tmo: timeout in milliseconds, 0 for non-blocking operation,
>> + *       negative value for infinite wait
>> + */
>> +bool uffd_poll_events(int uffd, int tmo)
> 
> Shall we spell "tmo" out?
> 
> Thanks,
> 

In the comment? I think it's ok.

>> +{
>> +    int res;
>> +    struct pollfd poll_fd = { .fd = uffd, .events = POLLIN, .revents = 0 };
>> +
>> +    do {
>> +        res = poll(&poll_fd, 1, tmo);
>> +    } while (res < 0 && errno == EINTR);
>> +
>> +    if (res == 0) {
>> +        return false;
>> +    }
>> +    if (res < 0) {
>> +        error_report("uffd_poll_events() failed: errno=%i", errno);
>> +        return false;
>> +    }
>> +
>> +    return (poll_fd.revents & POLLIN) != 0;
>> +}
>> -- 
>> 2.25.1
>>
>
Peter Xu Nov. 30, 2020, 3:34 p.m. UTC | #3
On Sun, Nov 29, 2020 at 11:12:10PM +0300, Andrey Gruzdev wrote:
> > > +void ram_write_tracking_stop(void)
> > > +{
> > > +#ifdef CONFIG_LINUX
> > > +    RAMState *rs = ram_state;
> > > +    RAMBlock *bs;
> > > +    assert(rs->uffdio_fd >= 0);
> > 
> > Maybe too harsh - we can return if it's invalid.
> > 
> > Meanwhile, better rcu_read_lock(), as well?
> > 
> 
> Yep, RCU lock, I'll add. Why too harsh? Just a debug assertion.

I was afraid some special path could trigger ram_write_tracking_stop() being
called before ram_write_tracking_start(), then vm could crash.  If we can
guarantee that not happening, then it's also ok with assert().

[...]

> > > +/**
> > > + * uffd_poll_events: poll UFFD file descriptor for read
> > > + *
> > > + * Returns true if events are available for read, false otherwise
> > > + *
> > > + * @uffd: UFFD file descriptor
> > > + * @tmo: timeout in milliseconds, 0 for non-blocking operation,
> > > + *       negative value for infinite wait
> > > + */
> > > +bool uffd_poll_events(int uffd, int tmo)
> > 
> > Shall we spell "tmo" out?
> In the comment? I think it's ok.

I'd suggest to spell it out everywhere, especially in the codes.  But feel free
to take your own preference.  Thanks,
Andrey Gruzdev Nov. 30, 2020, 6:41 p.m. UTC | #4
On 30.11.2020 18:34, Peter Xu wrote:
> On Sun, Nov 29, 2020 at 11:12:10PM +0300, Andrey Gruzdev wrote:
>>>> +void ram_write_tracking_stop(void)
>>>> +{
>>>> +#ifdef CONFIG_LINUX
>>>> +    RAMState *rs = ram_state;
>>>> +    RAMBlock *bs;
>>>> +    assert(rs->uffdio_fd >= 0);
>>>
>>> Maybe too harsh - we can return if it's invalid.
>>>
>>> Meanwhile, better rcu_read_lock(), as well?
>>>
>>
>> Yep, RCU lock, I'll add. Why too harsh? Just a debug assertion.
> 
> I was afraid some special path could trigger ram_write_tracking_stop() being
> called before ram_write_tracking_start(), then vm could crash.  If we can
> guarantee that not happening, then it's also ok with assert().
> 
> [...]
> 

It can't really happen, but I agree that assert() is a bit excessive here.

>>>> +/**
>>>> + * uffd_poll_events: poll UFFD file descriptor for read
>>>> + *
>>>> + * Returns true if events are available for read, false otherwise
>>>> + *
>>>> + * @uffd: UFFD file descriptor
>>>> + * @tmo: timeout in milliseconds, 0 for non-blocking operation,
>>>> + *       negative value for infinite wait
>>>> + */
>>>> +bool uffd_poll_events(int uffd, int tmo)
>>>
>>> Shall we spell "tmo" out?
>> In the comment? I think it's ok.
> 
> I'd suggest to spell it out everywhere, especially in the codes.  But feel free
> to take your own preference.  Thanks,
> 

Thanks,
Dr. David Alan Gilbert Dec. 1, 2020, 12:24 p.m. UTC | #5
* Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote:
> Implemented support for the whole RAM block memory
> protection/un-protection. Introduced higher level
> ram_write_tracking_start() and ram_write_tracking_stop()
> to start/stop tracking guest memory writes.
> 
> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
> ---
>  include/exec/memory.h      |   7 ++
>  include/qemu/userfaultfd.h |  29 +++++
>  migration/ram.c            | 120 +++++++++++++++++++++
>  migration/ram.h            |   4 +
>  util/meson.build           |   1 +
>  util/userfaultfd.c         | 215 +++++++++++++++++++++++++++++++++++++
>  6 files changed, 376 insertions(+)
>  create mode 100644 include/qemu/userfaultfd.h
>  create mode 100644 util/userfaultfd.c
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 0f3e6bcd5e..3d798fce16 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -139,6 +139,13 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>  /* RAM is a persistent kind memory */
>  #define RAM_PMEM (1 << 5)
>  
> +/*
> + * UFFDIO_WRITEPROTECT is used on this RAMBlock to
> + * support 'write-tracking' migration type.
> + * Implies ram_state->ram_wt_enabled.
> + */
> +#define RAM_UF_WRITEPROTECT (1 << 6)
> +
>  static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>                                         IOMMUNotifierFlag flags,
>                                         hwaddr start, hwaddr end,
> diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h
> new file mode 100644
> index 0000000000..fb843c76db
> --- /dev/null
> +++ b/include/qemu/userfaultfd.h
> @@ -0,0 +1,29 @@
> +/*
> + * Linux UFFD-WP support
> + *
> + * Copyright Virtuozzo GmbH, 2020
> + *
> + * Authors:
> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef USERFAULTFD_H
> +#define USERFAULTFD_H
> +
> +#include "qemu/osdep.h"
> +#include "exec/hwaddr.h"
> +#include <linux/userfaultfd.h>
> +
> +int uffd_create_fd(void);
> +void uffd_close_fd(int uffd);
> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
> +        bool track_missing, bool track_wp);
> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length);
> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp);
> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count);
> +bool uffd_poll_events(int uffd, int tmo);
> +
> +#endif /* USERFAULTFD_H */
> diff --git a/migration/ram.c b/migration/ram.c
> index 7811cde643..3adfd1948d 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -56,6 +56,11 @@
>  #include "savevm.h"
>  #include "qemu/iov.h"
>  #include "multifd.h"
> +#include "sysemu/runstate.h"
> +
> +#ifdef CONFIG_LINUX
> +#include "qemu/userfaultfd.h"
> +#endif
>  
>  /***********************************************************/
>  /* ram save/restore */
> @@ -298,6 +303,8 @@ struct RAMSrcPageRequest {
>  struct RAMState {
>      /* QEMUFile used for this migration */
>      QEMUFile *f;
> +    /* UFFD file descriptor, used in 'write-tracking' migration */
> +    int uffdio_fd;
>      /* Last block that we have visited searching for dirty pages */
>      RAMBlock *last_seen_block;
>      /* Last block from where we have sent data */
> @@ -3788,6 +3795,119 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
>      return 0;
>  }
>  
> +/*
> + * ram_write_tracking_start: start UFFD-WP memory tracking
> + *
> + * Returns 0 for success or negative value in case of error
> + *
> + */
> +int ram_write_tracking_start(void)

Please split this stuff off into a separate patch; just make this one
the userfaultfd.[ch] and then a separate one for gluing it into ram.c

> +{
> +#ifdef CONFIG_LINUX
> +    int uffd;
> +    RAMState *rs = ram_state;
> +    RAMBlock *bs;
> +
> +    /* Open UFFD file descriptor */
> +    uffd = uffd_create_fd();
> +    if (uffd < 0) {
> +        return uffd;
> +    }
> +    rs->uffdio_fd = uffd;
> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
> +        /* Nothing to do with read-only and MMIO-writable regions */
> +        if (bs->mr->readonly || bs->mr->rom_device) {
> +            continue;
> +        }
> +
> +        bs->flags |= RAM_UF_WRITEPROTECT;
> +        /* Register block memory with UFFD to track writes */
> +        if (uffd_register_memory(rs->uffdio_fd, (hwaddr) bs->host,
> +                bs->max_length, false, true)) {
> +            goto fail;
> +        }
> +        /* Apply UFFD write protection to the block memory range */
> +        if (uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host,
> +                bs->max_length, true)) {
> +            goto fail;
> +        }
> +
> +        info_report("UFFD-WP write-tracking enabled: "
> +                "block_id=%s page_size=%zu start=%p length=%lu "
> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
> +                bs->mr->nonvolatile, bs->mr->rom_device);
> +    }
> +
> +    return 0;
> +
> +fail:
> +    error_report("ram_write_tracking_start() failed: restoring initial memory state");
> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
> +            continue;
> +        }
> +        /*
> +         * In case some memory block failed to be write-protected
> +         * remove protection and unregister all succeeded RAM blocks
> +         */
> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
> +        /* Cleanup flags */
> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
> +    }
> +
> +    uffd_close_fd(uffd);
> +    rs->uffdio_fd = -1;
> +    return -1;
> +#else
> +    rs->uffdio_fd = -1;
> +    error_setg(&migrate_get_current()->error,
> +            "Background-snapshot not supported on non-Linux hosts");
> +    return -1;
> +#endif /* CONFIG_LINUX */
> +}
> +
> +/**
> + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
> + */
> +void ram_write_tracking_stop(void)
> +{
> +#ifdef CONFIG_LINUX
> +    RAMState *rs = ram_state;
> +    RAMBlock *bs;
> +    assert(rs->uffdio_fd >= 0);
> +
> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
> +            continue;
> +        }
> +        /* Remove protection and unregister all affected RAM blocks */
> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
> +        /* Cleanup flags */
> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
> +
> +        info_report("UFFD-WP write-tracking disabled: "
> +                "block_id=%s page_size=%zu start=%p length=%lu "
> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
> +                bs->mr->nonvolatile, bs->mr->rom_device);
> +    }
> +
> +    /* Finally close UFFD file descriptor */
> +    uffd_close_fd(rs->uffdio_fd);
> +    rs->uffdio_fd = -1;
> +#else
> +    error_setg(&migrate_get_current()->error,
> +            "Background-snapshot not supported on non-Linux hosts");
> +#endif /* CONFIG_LINUX */
> +}
> +
>  static SaveVMHandlers savevm_ram_handlers = {
>      .save_setup = ram_save_setup,
>      .save_live_iterate = ram_save_iterate,
> diff --git a/migration/ram.h b/migration/ram.h
> index 011e85414e..0ec63e27ee 100644
> --- a/migration/ram.h
> +++ b/migration/ram.h
> @@ -79,4 +79,8 @@ void colo_flush_ram_cache(void);
>  void colo_release_ram_cache(void);
>  void colo_incoming_start_dirty_log(void);
>  
> +/* Background snapshots */
> +int ram_write_tracking_start(void);
> +void ram_write_tracking_stop(void);
> +
>  #endif
> diff --git a/util/meson.build b/util/meson.build
> index f359af0d46..c64bfe94b3 100644
> --- a/util/meson.build
> +++ b/util/meson.build
> @@ -50,6 +50,7 @@ endif
>  
>  if have_system
>    util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
> +  util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
>  endif
>  
>  if have_block
> diff --git a/util/userfaultfd.c b/util/userfaultfd.c
> new file mode 100644
> index 0000000000..038953d7ed
> --- /dev/null
> +++ b/util/userfaultfd.c
> @@ -0,0 +1,215 @@
> +/*
> + * Linux UFFD-WP support
> + *
> + * Copyright Virtuozzo GmbH, 2020
> + *
> + * Authors:
> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/bitops.h"
> +#include "qemu/error-report.h"
> +#include "qemu/userfaultfd.h"
> +#include <poll.h>
> +#include <sys/syscall.h>
> +#include <sys/ioctl.h>
> +
> +/**
> + * uffd_create_fd: create UFFD file descriptor
> + *
> + * Returns non-negative file descriptor or negative value in case of an error
> + */
> +int uffd_create_fd(void)
> +{
> +    int uffd;
> +    struct uffdio_api api_struct;
> +    uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
> +
> +    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
> +    if (uffd < 0) {
> +        error_report("uffd_create_fd() failed: UFFD not supported");

Please include the errno (or strerror(errno)) 

> +        return -1;
> +    }
> +
> +    api_struct.api = UFFD_API;
> +    api_struct.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
> +    if (ioctl(uffd, UFFDIO_API, &api_struct)) {
> +        error_report("uffd_create_fd() failed: "
> +                     "API version not supported version=%llx errno=%i",
> +                api_struct.api, errno);
> +        goto fail;
> +    }
> +
> +    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
> +        error_report("uffd_create_fd() failed: "
> +                     "PAGEFAULT_FLAG_WP feature missing");
> +        goto fail;
> +    }
> +
> +    return uffd;
> +
> +fail:
> +    close(uffd);
> +    return -1;
> +}

It would be great if we could find a way to share this with
migration/postcopy-ram.c - this duplicates a lor of
request_ufd_features.

> +
> +/**
> + * uffd_close_fd: close UFFD file descriptor
> + *
> + * @uffd: UFFD file descriptor
> + */
> +void uffd_close_fd(int uffd)
> +{
> +    assert(uffd >= 0);
> +    close(uffd);
> +}
> +
> +/**
> + * uffd_register_memory: register memory range with UFFD
> + *
> + * Returns 0 in case of success, negative value on error
> + *
> + * @uffd: UFFD file descriptor
> + * @start: starting virtual address of memory range
> + * @length: length of memory range
> + * @track_missing: generate events on missing-page faults
> + * @track_wp: generate events on write-protected-page faults
> + */
> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
> +        bool track_missing, bool track_wp)
> +{
> +    struct uffdio_register uffd_register;
> +
> +    uffd_register.range.start = start;
> +    uffd_register.range.len = length;
> +    uffd_register.mode = (track_missing ? UFFDIO_REGISTER_MODE_MISSING : 0) |
> +                         (track_wp ? UFFDIO_REGISTER_MODE_WP : 0);
> +
> +    if (ioctl(uffd, UFFDIO_REGISTER, &uffd_register)) {
> +        error_report("uffd_register_memory() failed: "
> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",

PRIx64 for mode as well.

> +                start, length, uffd_register.mode, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * uffd_unregister_memory: un-register memory range with UFFD
> + *
> + * Returns 0 in case of success, negative value on error
> + *
> + * @uffd: UFFD file descriptor
> + * @start: starting virtual address of memory range
> + * @length: length of memory range
> + */
> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length)
> +{
> +    struct uffdio_range uffd_range;
> +
> +    uffd_range.start = start;
> +    uffd_range.len = length;
> +
> +    if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
> +        error_report("uffd_unregister_memory() failed: "
> +                     "start=%0"PRIx64" len=%"PRIu64" errno=%i",
> +                start, length, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * uffd_protect_memory: protect/unprotect memory range for writes with UFFD
> + *
> + * Returns 0 on success or negative value in case of error
> + *
> + * @uffd: UFFD file descriptor
> + * @start: starting virtual address of memory range
> + * @length: length of memory range
> + * @wp: write-protect/unprotect
> + */
> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp)
> +{
> +    struct uffdio_writeprotect uffd_writeprotect;
> +    int res;
> +
> +    uffd_writeprotect.range.start = start;
> +    uffd_writeprotect.range.len = length;
> +    uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
> +
> +    do {
> +        res = ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect);
> +    } while (res < 0 && errno == EINTR);
> +    if (res < 0) {
> +        error_report("uffd_protect_memory() failed: "
> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
> +                start, length, uffd_writeprotect.mode, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * uffd_read_events: read pending UFFD events
> + *
> + * Returns number of fetched messages, 0 if non is available or
> + * negative value in case of an error
> + *
> + * @uffd: UFFD file descriptor
> + * @msgs: pointer to message buffer
> + * @count: number of messages that can fit in the buffer
> + */
> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count)
> +{
> +    ssize_t res;
> +    do {
> +        res = read(uffd, msgs, count * sizeof(struct uffd_msg));
> +    } while (res < 0 && errno == EINTR);
> +
> +    if ((res < 0 && errno == EAGAIN)) {
> +        return 0;
> +    }
> +    if (res < 0) {
> +        error_report("uffd_read_events() failed: errno=%i", errno);
> +        return -1;
> +    }
> +
> +    return (int) (res / sizeof(struct uffd_msg));
> +}
> +
> +/**
> + * uffd_poll_events: poll UFFD file descriptor for read
> + *
> + * Returns true if events are available for read, false otherwise
> + *
> + * @uffd: UFFD file descriptor
> + * @tmo: timeout in milliseconds, 0 for non-blocking operation,
> + *       negative value for infinite wait
> + */
> +bool uffd_poll_events(int uffd, int tmo)
> +{
> +    int res;
> +    struct pollfd poll_fd = { .fd = uffd, .events = POLLIN, .revents = 0 };
> +
> +    do {
> +        res = poll(&poll_fd, 1, tmo);
> +    } while (res < 0 && errno == EINTR);
> +
> +    if (res == 0) {
> +        return false;
> +    }
> +    if (res < 0) {
> +        error_report("uffd_poll_events() failed: errno=%i", errno);
> +        return false;
> +    }
> +
> +    return (poll_fd.revents & POLLIN) != 0;
> +}
> -- 
> 2.25.1
>
Andrey Gruzdev Dec. 1, 2020, 7:32 p.m. UTC | #6
On 01.12.2020 15:24, Dr. David Alan Gilbert wrote:
> * Andrey Gruzdev (andrey.gruzdev@virtuozzo.com) wrote:
>> Implemented support for the whole RAM block memory
>> protection/un-protection. Introduced higher level
>> ram_write_tracking_start() and ram_write_tracking_stop()
>> to start/stop tracking guest memory writes.
>>
>> Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
>> ---
>>   include/exec/memory.h      |   7 ++
>>   include/qemu/userfaultfd.h |  29 +++++
>>   migration/ram.c            | 120 +++++++++++++++++++++
>>   migration/ram.h            |   4 +
>>   util/meson.build           |   1 +
>>   util/userfaultfd.c         | 215 +++++++++++++++++++++++++++++++++++++
>>   6 files changed, 376 insertions(+)
>>   create mode 100644 include/qemu/userfaultfd.h
>>   create mode 100644 util/userfaultfd.c
>>
>> diff --git a/include/exec/memory.h b/include/exec/memory.h
>> index 0f3e6bcd5e..3d798fce16 100644
>> --- a/include/exec/memory.h
>> +++ b/include/exec/memory.h
>> @@ -139,6 +139,13 @@ typedef struct IOMMUNotifier IOMMUNotifier;
>>   /* RAM is a persistent kind memory */
>>   #define RAM_PMEM (1 << 5)
>>   
>> +/*
>> + * UFFDIO_WRITEPROTECT is used on this RAMBlock to
>> + * support 'write-tracking' migration type.
>> + * Implies ram_state->ram_wt_enabled.
>> + */
>> +#define RAM_UF_WRITEPROTECT (1 << 6)
>> +
>>   static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
>>                                          IOMMUNotifierFlag flags,
>>                                          hwaddr start, hwaddr end,
>> diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h
>> new file mode 100644
>> index 0000000000..fb843c76db
>> --- /dev/null
>> +++ b/include/qemu/userfaultfd.h
>> @@ -0,0 +1,29 @@
>> +/*
>> + * Linux UFFD-WP support
>> + *
>> + * Copyright Virtuozzo GmbH, 2020
>> + *
>> + * Authors:
>> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>> + * later.  See the COPYING file in the top-level directory.
>> + */
>> +
>> +#ifndef USERFAULTFD_H
>> +#define USERFAULTFD_H
>> +
>> +#include "qemu/osdep.h"
>> +#include "exec/hwaddr.h"
>> +#include <linux/userfaultfd.h>
>> +
>> +int uffd_create_fd(void);
>> +void uffd_close_fd(int uffd);
>> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
>> +        bool track_missing, bool track_wp);
>> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length);
>> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp);
>> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count);
>> +bool uffd_poll_events(int uffd, int tmo);
>> +
>> +#endif /* USERFAULTFD_H */
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 7811cde643..3adfd1948d 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -56,6 +56,11 @@
>>   #include "savevm.h"
>>   #include "qemu/iov.h"
>>   #include "multifd.h"
>> +#include "sysemu/runstate.h"
>> +
>> +#ifdef CONFIG_LINUX
>> +#include "qemu/userfaultfd.h"
>> +#endif
>>   
>>   /***********************************************************/
>>   /* ram save/restore */
>> @@ -298,6 +303,8 @@ struct RAMSrcPageRequest {
>>   struct RAMState {
>>       /* QEMUFile used for this migration */
>>       QEMUFile *f;
>> +    /* UFFD file descriptor, used in 'write-tracking' migration */
>> +    int uffdio_fd;
>>       /* Last block that we have visited searching for dirty pages */
>>       RAMBlock *last_seen_block;
>>       /* Last block from where we have sent data */
>> @@ -3788,6 +3795,119 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
>>       return 0;
>>   }
>>   
>> +/*
>> + * ram_write_tracking_start: start UFFD-WP memory tracking
>> + *
>> + * Returns 0 for success or negative value in case of error
>> + *
>> + */
>> +int ram_write_tracking_start(void)
> 
> Please split this stuff off into a separate patch; just make this one
> the userfaultfd.[ch] and then a separate one for gluing it into ram.c
> 

Yep, I'm separating it now - userfaultdf.c and ram_write_tracking stuff, 
they go to different patches.

>> +{
>> +#ifdef CONFIG_LINUX
>> +    int uffd;
>> +    RAMState *rs = ram_state;
>> +    RAMBlock *bs;
>> +
>> +    /* Open UFFD file descriptor */
>> +    uffd = uffd_create_fd();
>> +    if (uffd < 0) {
>> +        return uffd;
>> +    }
>> +    rs->uffdio_fd = uffd;
>> +
>> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
>> +        /* Nothing to do with read-only and MMIO-writable regions */
>> +        if (bs->mr->readonly || bs->mr->rom_device) {
>> +            continue;
>> +        }
>> +
>> +        bs->flags |= RAM_UF_WRITEPROTECT;
>> +        /* Register block memory with UFFD to track writes */
>> +        if (uffd_register_memory(rs->uffdio_fd, (hwaddr) bs->host,
>> +                bs->max_length, false, true)) {
>> +            goto fail;
>> +        }
>> +        /* Apply UFFD write protection to the block memory range */
>> +        if (uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host,
>> +                bs->max_length, true)) {
>> +            goto fail;
>> +        }
>> +
>> +        info_report("UFFD-WP write-tracking enabled: "
>> +                "block_id=%s page_size=%zu start=%p length=%lu "
>> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
>> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
>> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
>> +                bs->mr->nonvolatile, bs->mr->rom_device);
>> +    }
>> +
>> +    return 0;
>> +
>> +fail:
>> +    error_report("ram_write_tracking_start() failed: restoring initial memory state");
>> +
>> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
>> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
>> +            continue;
>> +        }
>> +        /*
>> +         * In case some memory block failed to be write-protected
>> +         * remove protection and unregister all succeeded RAM blocks
>> +         */
>> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
>> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
>> +        /* Cleanup flags */
>> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
>> +    }
>> +
>> +    uffd_close_fd(uffd);
>> +    rs->uffdio_fd = -1;
>> +    return -1;
>> +#else
>> +    rs->uffdio_fd = -1;
>> +    error_setg(&migrate_get_current()->error,
>> +            "Background-snapshot not supported on non-Linux hosts");
>> +    return -1;
>> +#endif /* CONFIG_LINUX */
>> +}
>> +
>> +/**
>> + * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
>> + */
>> +void ram_write_tracking_stop(void)
>> +{
>> +#ifdef CONFIG_LINUX
>> +    RAMState *rs = ram_state;
>> +    RAMBlock *bs;
>> +    assert(rs->uffdio_fd >= 0);
>> +
>> +    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
>> +        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
>> +            continue;
>> +        }
>> +        /* Remove protection and unregister all affected RAM blocks */
>> +        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
>> +        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
>> +        /* Cleanup flags */
>> +        bs->flags &= ~RAM_UF_WRITEPROTECT;
>> +
>> +        info_report("UFFD-WP write-tracking disabled: "
>> +                "block_id=%s page_size=%zu start=%p length=%lu "
>> +                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
>> +                bs->idstr, bs->page_size, bs->host, bs->max_length,
>> +                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
>> +                bs->mr->nonvolatile, bs->mr->rom_device);
>> +    }
>> +
>> +    /* Finally close UFFD file descriptor */
>> +    uffd_close_fd(rs->uffdio_fd);
>> +    rs->uffdio_fd = -1;
>> +#else
>> +    error_setg(&migrate_get_current()->error,
>> +            "Background-snapshot not supported on non-Linux hosts");
>> +#endif /* CONFIG_LINUX */
>> +}
>> +
>>   static SaveVMHandlers savevm_ram_handlers = {
>>       .save_setup = ram_save_setup,
>>       .save_live_iterate = ram_save_iterate,
>> diff --git a/migration/ram.h b/migration/ram.h
>> index 011e85414e..0ec63e27ee 100644
>> --- a/migration/ram.h
>> +++ b/migration/ram.h
>> @@ -79,4 +79,8 @@ void colo_flush_ram_cache(void);
>>   void colo_release_ram_cache(void);
>>   void colo_incoming_start_dirty_log(void);
>>   
>> +/* Background snapshots */
>> +int ram_write_tracking_start(void);
>> +void ram_write_tracking_stop(void);
>> +
>>   #endif
>> diff --git a/util/meson.build b/util/meson.build
>> index f359af0d46..c64bfe94b3 100644
>> --- a/util/meson.build
>> +++ b/util/meson.build
>> @@ -50,6 +50,7 @@ endif
>>   
>>   if have_system
>>     util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
>> +  util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
>>   endif
>>   
>>   if have_block
>> diff --git a/util/userfaultfd.c b/util/userfaultfd.c
>> new file mode 100644
>> index 0000000000..038953d7ed
>> --- /dev/null
>> +++ b/util/userfaultfd.c
>> @@ -0,0 +1,215 @@
>> +/*
>> + * Linux UFFD-WP support
>> + *
>> + * Copyright Virtuozzo GmbH, 2020
>> + *
>> + * Authors:
>> + *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>> + * later.  See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/bitops.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/userfaultfd.h"
>> +#include <poll.h>
>> +#include <sys/syscall.h>
>> +#include <sys/ioctl.h>
>> +
>> +/**
>> + * uffd_create_fd: create UFFD file descriptor
>> + *
>> + * Returns non-negative file descriptor or negative value in case of an error
>> + */
>> +int uffd_create_fd(void)
>> +{
>> +    int uffd;
>> +    struct uffdio_api api_struct;
>> +    uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
>> +
>> +    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
>> +    if (uffd < 0) {
>> +        error_report("uffd_create_fd() failed: UFFD not supported");
> 
> Please include the errno (or strerror(errno))
> 

Ok.

>> +        return -1;
>> +    }
>> +
>> +    api_struct.api = UFFD_API;
>> +    api_struct.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
>> +    if (ioctl(uffd, UFFDIO_API, &api_struct)) {
>> +        error_report("uffd_create_fd() failed: "
>> +                     "API version not supported version=%llx errno=%i",
>> +                api_struct.api, errno);
>> +        goto fail;
>> +    }
>> +
>> +    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
>> +        error_report("uffd_create_fd() failed: "
>> +                     "PAGEFAULT_FLAG_WP feature missing");
>> +        goto fail;
>> +    }
>> +
>> +    return uffd;
>> +
>> +fail:
>> +    close(uffd);
>> +    return -1;
>> +}
> 
> It would be great if we could find a way to share this with
> migration/postcopy-ram.c - this duplicates a lor of
> request_ufd_features.
> 

Now I'm changing userfaultfd.c glue code to support also the postcopy 
requirenments - adding functionality to query UFFD feature support, glue 
for COPY_PAGE/ZERO_PAGE IOCTLs etc.

>> +
>> +/**
>> + * uffd_close_fd: close UFFD file descriptor
>> + *
>> + * @uffd: UFFD file descriptor
>> + */
>> +void uffd_close_fd(int uffd)
>> +{
>> +    assert(uffd >= 0);
>> +    close(uffd);
>> +}
>> +
>> +/**
>> + * uffd_register_memory: register memory range with UFFD
>> + *
>> + * Returns 0 in case of success, negative value on error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @start: starting virtual address of memory range
>> + * @length: length of memory range
>> + * @track_missing: generate events on missing-page faults
>> + * @track_wp: generate events on write-protected-page faults
>> + */
>> +int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
>> +        bool track_missing, bool track_wp)
>> +{
>> +    struct uffdio_register uffd_register;
>> +
>> +    uffd_register.range.start = start;
>> +    uffd_register.range.len = length;
>> +    uffd_register.mode = (track_missing ? UFFDIO_REGISTER_MODE_MISSING : 0) |
>> +                         (track_wp ? UFFDIO_REGISTER_MODE_WP : 0);
>> +
>> +    if (ioctl(uffd, UFFDIO_REGISTER, &uffd_register)) {
>> +        error_report("uffd_register_memory() failed: "
>> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
> 
> PRIx64 for mode as well.
> 

Ok.

>> +                start, length, uffd_register.mode, errno);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/**
>> + * uffd_unregister_memory: un-register memory range with UFFD
>> + *
>> + * Returns 0 in case of success, negative value on error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @start: starting virtual address of memory range
>> + * @length: length of memory range
>> + */
>> +int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length)
>> +{
>> +    struct uffdio_range uffd_range;
>> +
>> +    uffd_range.start = start;
>> +    uffd_range.len = length;
>> +
>> +    if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
>> +        error_report("uffd_unregister_memory() failed: "
>> +                     "start=%0"PRIx64" len=%"PRIu64" errno=%i",
>> +                start, length, errno);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/**
>> + * uffd_protect_memory: protect/unprotect memory range for writes with UFFD
>> + *
>> + * Returns 0 on success or negative value in case of error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @start: starting virtual address of memory range
>> + * @length: length of memory range
>> + * @wp: write-protect/unprotect
>> + */
>> +int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp)
>> +{
>> +    struct uffdio_writeprotect uffd_writeprotect;
>> +    int res;
>> +
>> +    uffd_writeprotect.range.start = start;
>> +    uffd_writeprotect.range.len = length;
>> +    uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
>> +
>> +    do {
>> +        res = ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect);
>> +    } while (res < 0 && errno == EINTR);
>> +    if (res < 0) {
>> +        error_report("uffd_protect_memory() failed: "
>> +                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
>> +                start, length, uffd_writeprotect.mode, errno);
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/**
>> + * uffd_read_events: read pending UFFD events
>> + *
>> + * Returns number of fetched messages, 0 if non is available or
>> + * negative value in case of an error
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @msgs: pointer to message buffer
>> + * @count: number of messages that can fit in the buffer
>> + */
>> +int uffd_read_events(int uffd, struct uffd_msg *msgs, int count)
>> +{
>> +    ssize_t res;
>> +    do {
>> +        res = read(uffd, msgs, count * sizeof(struct uffd_msg));
>> +    } while (res < 0 && errno == EINTR);
>> +
>> +    if ((res < 0 && errno == EAGAIN)) {
>> +        return 0;
>> +    }
>> +    if (res < 0) {
>> +        error_report("uffd_read_events() failed: errno=%i", errno);
>> +        return -1;
>> +    }
>> +
>> +    return (int) (res / sizeof(struct uffd_msg));
>> +}
>> +
>> +/**
>> + * uffd_poll_events: poll UFFD file descriptor for read
>> + *
>> + * Returns true if events are available for read, false otherwise
>> + *
>> + * @uffd: UFFD file descriptor
>> + * @tmo: timeout in milliseconds, 0 for non-blocking operation,
>> + *       negative value for infinite wait
>> + */
>> +bool uffd_poll_events(int uffd, int tmo)
>> +{
>> +    int res;
>> +    struct pollfd poll_fd = { .fd = uffd, .events = POLLIN, .revents = 0 };
>> +
>> +    do {
>> +        res = poll(&poll_fd, 1, tmo);
>> +    } while (res < 0 && errno == EINTR);
>> +
>> +    if (res == 0) {
>> +        return false;
>> +    }
>> +    if (res < 0) {
>> +        error_report("uffd_poll_events() failed: errno=%i", errno);
>> +        return false;
>> +    }
>> +
>> +    return (poll_fd.revents & POLLIN) != 0;
>> +}
>> -- 
>> 2.25.1
>>
diff mbox series

Patch

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 0f3e6bcd5e..3d798fce16 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -139,6 +139,13 @@  typedef struct IOMMUNotifier IOMMUNotifier;
 /* RAM is a persistent kind memory */
 #define RAM_PMEM (1 << 5)
 
+/*
+ * UFFDIO_WRITEPROTECT is used on this RAMBlock to
+ * support 'write-tracking' migration type.
+ * Implies ram_state->ram_wt_enabled.
+ */
+#define RAM_UF_WRITEPROTECT (1 << 6)
+
 static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
                                        IOMMUNotifierFlag flags,
                                        hwaddr start, hwaddr end,
diff --git a/include/qemu/userfaultfd.h b/include/qemu/userfaultfd.h
new file mode 100644
index 0000000000..fb843c76db
--- /dev/null
+++ b/include/qemu/userfaultfd.h
@@ -0,0 +1,29 @@ 
+/*
+ * Linux UFFD-WP support
+ *
+ * Copyright Virtuozzo GmbH, 2020
+ *
+ * Authors:
+ *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef USERFAULTFD_H
+#define USERFAULTFD_H
+
+#include "qemu/osdep.h"
+#include "exec/hwaddr.h"
+#include <linux/userfaultfd.h>
+
+int uffd_create_fd(void);
+void uffd_close_fd(int uffd);
+int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
+        bool track_missing, bool track_wp);
+int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length);
+int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp);
+int uffd_read_events(int uffd, struct uffd_msg *msgs, int count);
+bool uffd_poll_events(int uffd, int tmo);
+
+#endif /* USERFAULTFD_H */
diff --git a/migration/ram.c b/migration/ram.c
index 7811cde643..3adfd1948d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -56,6 +56,11 @@ 
 #include "savevm.h"
 #include "qemu/iov.h"
 #include "multifd.h"
+#include "sysemu/runstate.h"
+
+#ifdef CONFIG_LINUX
+#include "qemu/userfaultfd.h"
+#endif
 
 /***********************************************************/
 /* ram save/restore */
@@ -298,6 +303,8 @@  struct RAMSrcPageRequest {
 struct RAMState {
     /* QEMUFile used for this migration */
     QEMUFile *f;
+    /* UFFD file descriptor, used in 'write-tracking' migration */
+    int uffdio_fd;
     /* Last block that we have visited searching for dirty pages */
     RAMBlock *last_seen_block;
     /* Last block from where we have sent data */
@@ -3788,6 +3795,119 @@  static int ram_resume_prepare(MigrationState *s, void *opaque)
     return 0;
 }
 
+/*
+ * ram_write_tracking_start: start UFFD-WP memory tracking
+ *
+ * Returns 0 for success or negative value in case of error
+ *
+ */
+int ram_write_tracking_start(void)
+{
+#ifdef CONFIG_LINUX
+    int uffd;
+    RAMState *rs = ram_state;
+    RAMBlock *bs;
+
+    /* Open UFFD file descriptor */
+    uffd = uffd_create_fd();
+    if (uffd < 0) {
+        return uffd;
+    }
+    rs->uffdio_fd = uffd;
+
+    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+        /* Nothing to do with read-only and MMIO-writable regions */
+        if (bs->mr->readonly || bs->mr->rom_device) {
+            continue;
+        }
+
+        bs->flags |= RAM_UF_WRITEPROTECT;
+        /* Register block memory with UFFD to track writes */
+        if (uffd_register_memory(rs->uffdio_fd, (hwaddr) bs->host,
+                bs->max_length, false, true)) {
+            goto fail;
+        }
+        /* Apply UFFD write protection to the block memory range */
+        if (uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host,
+                bs->max_length, true)) {
+            goto fail;
+        }
+
+        info_report("UFFD-WP write-tracking enabled: "
+                "block_id=%s page_size=%zu start=%p length=%lu "
+                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
+                bs->idstr, bs->page_size, bs->host, bs->max_length,
+                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
+                bs->mr->nonvolatile, bs->mr->rom_device);
+    }
+
+    return 0;
+
+fail:
+    error_report("ram_write_tracking_start() failed: restoring initial memory state");
+
+    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
+            continue;
+        }
+        /*
+         * In case some memory block failed to be write-protected
+         * remove protection and unregister all succeeded RAM blocks
+         */
+        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
+        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
+        /* Cleanup flags */
+        bs->flags &= ~RAM_UF_WRITEPROTECT;
+    }
+
+    uffd_close_fd(uffd);
+    rs->uffdio_fd = -1;
+    return -1;
+#else
+    rs->uffdio_fd = -1;
+    error_setg(&migrate_get_current()->error,
+            "Background-snapshot not supported on non-Linux hosts");
+    return -1;
+#endif /* CONFIG_LINUX */
+}
+
+/**
+ * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
+ */
+void ram_write_tracking_stop(void)
+{
+#ifdef CONFIG_LINUX
+    RAMState *rs = ram_state;
+    RAMBlock *bs;
+    assert(rs->uffdio_fd >= 0);
+
+    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
+            continue;
+        }
+        /* Remove protection and unregister all affected RAM blocks */
+        uffd_protect_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length, false);
+        uffd_unregister_memory(rs->uffdio_fd, (hwaddr) bs->host, bs->max_length);
+        /* Cleanup flags */
+        bs->flags &= ~RAM_UF_WRITEPROTECT;
+
+        info_report("UFFD-WP write-tracking disabled: "
+                "block_id=%s page_size=%zu start=%p length=%lu "
+                "romd_mode=%i ram=%i readonly=%i nonvolatile=%i rom_device=%i",
+                bs->idstr, bs->page_size, bs->host, bs->max_length,
+                bs->mr->romd_mode, bs->mr->ram, bs->mr->readonly,
+                bs->mr->nonvolatile, bs->mr->rom_device);
+    }
+
+    /* Finally close UFFD file descriptor */
+    uffd_close_fd(rs->uffdio_fd);
+    rs->uffdio_fd = -1;
+#else
+    error_setg(&migrate_get_current()->error,
+            "Background-snapshot not supported on non-Linux hosts");
+#endif /* CONFIG_LINUX */
+}
+
 static SaveVMHandlers savevm_ram_handlers = {
     .save_setup = ram_save_setup,
     .save_live_iterate = ram_save_iterate,
diff --git a/migration/ram.h b/migration/ram.h
index 011e85414e..0ec63e27ee 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -79,4 +79,8 @@  void colo_flush_ram_cache(void);
 void colo_release_ram_cache(void);
 void colo_incoming_start_dirty_log(void);
 
+/* Background snapshots */
+int ram_write_tracking_start(void);
+void ram_write_tracking_stop(void);
+
 #endif
diff --git a/util/meson.build b/util/meson.build
index f359af0d46..c64bfe94b3 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -50,6 +50,7 @@  endif
 
 if have_system
   util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
+  util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
 endif
 
 if have_block
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
new file mode 100644
index 0000000000..038953d7ed
--- /dev/null
+++ b/util/userfaultfd.c
@@ -0,0 +1,215 @@ 
+/*
+ * Linux UFFD-WP support
+ *
+ * Copyright Virtuozzo GmbH, 2020
+ *
+ * Authors:
+ *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/userfaultfd.h"
+#include <poll.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+
+/**
+ * uffd_create_fd: create UFFD file descriptor
+ *
+ * Returns non-negative file descriptor or negative value in case of an error
+ */
+int uffd_create_fd(void)
+{
+    int uffd;
+    struct uffdio_api api_struct;
+    uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
+
+    uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+    if (uffd < 0) {
+        error_report("uffd_create_fd() failed: UFFD not supported");
+        return -1;
+    }
+
+    api_struct.api = UFFD_API;
+    api_struct.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+    if (ioctl(uffd, UFFDIO_API, &api_struct)) {
+        error_report("uffd_create_fd() failed: "
+                     "API version not supported version=%llx errno=%i",
+                api_struct.api, errno);
+        goto fail;
+    }
+
+    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+        error_report("uffd_create_fd() failed: "
+                     "PAGEFAULT_FLAG_WP feature missing");
+        goto fail;
+    }
+
+    return uffd;
+
+fail:
+    close(uffd);
+    return -1;
+}
+
+/**
+ * uffd_close_fd: close UFFD file descriptor
+ *
+ * @uffd: UFFD file descriptor
+ */
+void uffd_close_fd(int uffd)
+{
+    assert(uffd >= 0);
+    close(uffd);
+}
+
+/**
+ * uffd_register_memory: register memory range with UFFD
+ *
+ * Returns 0 in case of success, negative value on error
+ *
+ * @uffd: UFFD file descriptor
+ * @start: starting virtual address of memory range
+ * @length: length of memory range
+ * @track_missing: generate events on missing-page faults
+ * @track_wp: generate events on write-protected-page faults
+ */
+int uffd_register_memory(int uffd, hwaddr start, hwaddr length,
+        bool track_missing, bool track_wp)
+{
+    struct uffdio_register uffd_register;
+
+    uffd_register.range.start = start;
+    uffd_register.range.len = length;
+    uffd_register.mode = (track_missing ? UFFDIO_REGISTER_MODE_MISSING : 0) |
+                         (track_wp ? UFFDIO_REGISTER_MODE_WP : 0);
+
+    if (ioctl(uffd, UFFDIO_REGISTER, &uffd_register)) {
+        error_report("uffd_register_memory() failed: "
+                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
+                start, length, uffd_register.mode, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * uffd_unregister_memory: un-register memory range with UFFD
+ *
+ * Returns 0 in case of success, negative value on error
+ *
+ * @uffd: UFFD file descriptor
+ * @start: starting virtual address of memory range
+ * @length: length of memory range
+ */
+int uffd_unregister_memory(int uffd, hwaddr start, hwaddr length)
+{
+    struct uffdio_range uffd_range;
+
+    uffd_range.start = start;
+    uffd_range.len = length;
+
+    if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) {
+        error_report("uffd_unregister_memory() failed: "
+                     "start=%0"PRIx64" len=%"PRIu64" errno=%i",
+                start, length, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * uffd_protect_memory: protect/unprotect memory range for writes with UFFD
+ *
+ * Returns 0 on success or negative value in case of error
+ *
+ * @uffd: UFFD file descriptor
+ * @start: starting virtual address of memory range
+ * @length: length of memory range
+ * @wp: write-protect/unprotect
+ */
+int uffd_protect_memory(int uffd, hwaddr start, hwaddr length, bool wp)
+{
+    struct uffdio_writeprotect uffd_writeprotect;
+    int res;
+
+    uffd_writeprotect.range.start = start;
+    uffd_writeprotect.range.len = length;
+    uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
+
+    do {
+        res = ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect);
+    } while (res < 0 && errno == EINTR);
+    if (res < 0) {
+        error_report("uffd_protect_memory() failed: "
+                     "start=%0"PRIx64" len=%"PRIu64" mode=%llu errno=%i",
+                start, length, uffd_writeprotect.mode, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * uffd_read_events: read pending UFFD events
+ *
+ * Returns number of fetched messages, 0 if non is available or
+ * negative value in case of an error
+ *
+ * @uffd: UFFD file descriptor
+ * @msgs: pointer to message buffer
+ * @count: number of messages that can fit in the buffer
+ */
+int uffd_read_events(int uffd, struct uffd_msg *msgs, int count)
+{
+    ssize_t res;
+    do {
+        res = read(uffd, msgs, count * sizeof(struct uffd_msg));
+    } while (res < 0 && errno == EINTR);
+
+    if ((res < 0 && errno == EAGAIN)) {
+        return 0;
+    }
+    if (res < 0) {
+        error_report("uffd_read_events() failed: errno=%i", errno);
+        return -1;
+    }
+
+    return (int) (res / sizeof(struct uffd_msg));
+}
+
+/**
+ * uffd_poll_events: poll UFFD file descriptor for read
+ *
+ * Returns true if events are available for read, false otherwise
+ *
+ * @uffd: UFFD file descriptor
+ * @tmo: timeout in milliseconds, 0 for non-blocking operation,
+ *       negative value for infinite wait
+ */
+bool uffd_poll_events(int uffd, int tmo)
+{
+    int res;
+    struct pollfd poll_fd = { .fd = uffd, .events = POLLIN, .revents = 0 };
+
+    do {
+        res = poll(&poll_fd, 1, tmo);
+    } while (res < 0 && errno == EINTR);
+
+    if (res == 0) {
+        return false;
+    }
+    if (res < 0) {
+        error_report("uffd_poll_events() failed: errno=%i", errno);
+        return false;
+    }
+
+    return (poll_fd.revents & POLLIN) != 0;
+}