@@ -18551,6 +18551,13 @@ S: Maintained
F: drivers/ptp/ptp_vclock.c
F: net/ethtool/phc_vclocks.c
+PTP VMCLOCK SUPPORT
+M: David Woodhouse <dwmw2@infradead.org>
+L: netdev@vger.kernel.org
+S: Maintained
+F: drivers/ptp/ptp_vmclock.c
+F: include/uapi/linux/vmclock-abi.h
+
PTRACE SUPPORT
M: Oleg Nesterov <oleg@redhat.com>
S: Maintained
@@ -131,6 +131,19 @@ config PTP_1588_CLOCK_KVM
To compile this driver as a module, choose M here: the module
will be called ptp_kvm.
+config PTP_1588_CLOCK_VMCLOCK
+ tristate "Virtual machine PTP clock"
+ depends on X86_TSC || ARM_ARCH_TIMER
+ depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128
+ default y
+ help
+ This driver adds support for using a virtual precision clock
+ advertised by the hypervisor. This clock is only useful in virtual
+ machines where such a device is present.
+
+ To compile this driver as a module, choose M here: the module
+ will be called ptp_vmclock.
+
config PTP_1588_CLOCK_IDT82P33
tristate "IDT 82P33xxx PTP clock"
depends on PTP_1588_CLOCK && I2C
@@ -11,6 +11,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o
obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o
obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o
obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o
+obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK) += ptp_vmclock.o
obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp-qoriq.o
ptp-qoriq-y += ptp_qoriq.o
ptp-qoriq-$(CONFIG_DEBUG_FS) += ptp_qoriq_debugfs.o
new file mode 100644
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Virtual PTP 1588 clock for use with LM-safe VMclock device.
+ *
+ * Copyright © 2024 Amazon.com, Inc. or its affiliates.
+ */
+
+#include <linux/acpi.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+#include <uapi/linux/vmclock-abi.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+#ifdef CONFIG_X86
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#endif
+
+#ifdef CONFIG_KVM_GUEST
+#define SUPPORT_KVMCLOCK
+#endif
+
+static DEFINE_IDA(vmclock_ida);
+
+ACPI_MODULE_NAME("vmclock");
+
+struct vmclock_state {
+ struct resource res;
+ struct vmclock_abi *clk;
+ struct miscdevice miscdev;
+ struct ptp_clock_info ptp_clock_info;
+ struct ptp_clock *ptp_clock;
+ enum clocksource_ids cs_id, sys_cs_id;
+ int index;
+ char *name;
+};
+
+#define VMCLOCK_MAX_WAIT ms_to_ktime(100)
+
+/* Require at least the flags field to be present. All else can be optional. */
+#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)
+
+#define VMCLOCK_FIELD_PRESENT(_c, _f) \
+ (le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \
+ sizeof((_c)->_f)))
+
+/*
+ * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
+ * and add the fractional second part of the reference time.
+ *
+ * The result is a 128-bit value, the top 64 bits of which are seconds, and
+ * the low 64 bits are (seconds >> 64).
+ */
+static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
+ uint64_t period, uint8_t shift,
+ uint64_t frac_sec)
+{
+ unsigned __int128 res = (unsigned __int128)delta * period;
+
+ res >>= shift;
+ res += frac_sec;
+ *res_hi = res >> 64;
+ return (uint64_t)res;
+}
+
+static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
+{
+ if (likely(clk->time_type == VMCLOCK_TIME_UTC))
+ return true;
+
+ if (clk->time_type == VMCLOCK_TIME_TAI &&
+ (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
+ if (sec)
+ *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
+ return true;
+ }
+ return false;
+}
+
+static int vmclock_get_crosststamp(struct vmclock_state *st,
+ struct ptp_system_timestamp *sts,
+ struct system_counterval_t *system_counter,
+ struct timespec64 *tspec)
+{
+ ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+ struct system_time_snapshot systime_snapshot;
+ uint64_t cycle, delta, seq, frac_sec;
+
+#ifdef CONFIG_X86
+ /*
+ * We'd expect the hypervisor to know this and to report the clock
+ * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
+ */
+ if (check_tsc_unstable())
+ return -EINVAL;
+#endif
+
+ while (1) {
+ seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;
+
+ /*
+ * This pairs with a write barrier in the hypervisor
+ * which populates this structure.
+ */
+ virt_rmb();
+
+ if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
+ return -EINVAL;
+
+ /*
+ * When invoked for gettimex64(), fill in the pre/post system
+ * times. The simple case is when system time is based on the
+ * same counter as st->cs_id, in which case all three times
+ * will be derived from the *same* counter value.
+ *
+ * If the system isn't using the same counter, then the value
+ * from ktime_get_snapshot() will still be used as pre_ts, and
+ * ptp_read_system_postts() is called to populate postts after
+ * calling get_cycles().
+ *
+ * The conversion to timespec64 happens further down, outside
+ * the seq_count loop.
+ */
+ if (sts) {
+ ktime_get_snapshot(&systime_snapshot);
+ if (systime_snapshot.cs_id == st->cs_id) {
+ cycle = systime_snapshot.cycles;
+ } else {
+ cycle = get_cycles();
+ ptp_read_system_postts(sts);
+ }
+ } else {
+ cycle = get_cycles();
+ }
+
+ delta = cycle - le64_to_cpu(st->clk->counter_value);
+
+ frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
+ le64_to_cpu(st->clk->counter_period_frac_sec),
+ st->clk->counter_period_shift,
+ le64_to_cpu(st->clk->time_frac_sec));
+ tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
+ tspec->tv_sec += le64_to_cpu(st->clk->time_sec);
+
+ if (!tai_adjust(st->clk, &tspec->tv_sec))
+ return -EINVAL;
+
+ /*
+ * This pairs with a write barrier in the hypervisor
+ * which populates this structure.
+ */
+ virt_rmb();
+ if (seq == le32_to_cpu(st->clk->seq_count))
+ break;
+
+ if (ktime_after(ktime_get(), deadline))
+ return -ETIMEDOUT;
+ }
+
+ if (system_counter) {
+ system_counter->cycles = cycle;
+ system_counter->cs_id = st->cs_id;
+ }
+
+ if (sts) {
+ sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
+ if (systime_snapshot.cs_id == st->cs_id)
+ sts->post_ts = sts->pre_ts;
+ }
+
+ return 0;
+}
+
+#ifdef SUPPORT_KVMCLOCK
+/*
+ * In the case where the system is using the KVM clock for timekeeping, convert
+ * the TSC value into a KVM clock time in order to return a paired reading that
+ * get_device_system_crosststamp() can cope with.
+ */
+static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
+ struct ptp_system_timestamp *sts,
+ struct system_counterval_t *system_counter,
+ struct timespec64 *tspec)
+{
+ struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
+ unsigned int pvti_ver;
+ int ret;
+
+ preempt_disable_notrace();
+
+ do {
+ pvti_ver = pvclock_read_begin(pvti);
+
+ ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
+ if (ret)
+ break;
+
+ system_counter->cycles = __pvclock_read_cycles(pvti,
+ system_counter->cycles);
+ system_counter->cs_id = CSID_X86_KVM_CLK;
+
+ /*
+ * This retry should never really happen; if the TSC is
+ * stable and reliable enough across vCPUS that it is sane
+ * for the hypervisor to expose a VMCLOCK device which uses
+ * it as the reference counter, then the KVM clock sohuld be
+ * in 'master clock mode' and basically never changed. But
+ * the KVM clock is a fickle and often broken thing, so do
+ * it "properly" just in case.
+ */
+ } while (pvclock_read_retry(pvti, pvti_ver));
+
+ preempt_enable_notrace();
+
+ return ret;
+}
+#endif
+
+static int ptp_vmclock_get_time_fn(ktime_t *device_time,
+ struct system_counterval_t *system_counter,
+ void *ctx)
+{
+ struct vmclock_state *st = ctx;
+ struct timespec64 tspec;
+ int ret;
+
+#ifdef SUPPORT_KVMCLOCK
+ if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
+ ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
+ &tspec);
+ else
+#endif
+ ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
+
+ if (!ret)
+ *device_time = timespec64_to_ktime(tspec);
+
+ return ret;
+}
+
+static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
+ struct system_device_crosststamp *xtstamp)
+{
+ struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+ ptp_clock_info);
+ int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
+ NULL, xtstamp);
+#ifdef SUPPORT_KVMCLOCK
+ /*
+ * On x86, the KVM clock may be used for the system time. We can
+ * actually convert a TSC reading to that, and return a paired
+ * timestamp that get_device_system_crosststamp() *can* handle.
+ */
+ if (ret == -ENODEV) {
+ struct system_time_snapshot systime_snapshot;
+
+ ktime_get_snapshot(&systime_snapshot);
+
+ if (systime_snapshot.cs_id == CSID_X86_TSC ||
+ systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
+ WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
+ ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
+ st, NULL, xtstamp);
+ }
+ }
+#endif
+ return ret;
+}
+
+/*
+ * PTP clock operations
+ */
+
+static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
+{
+ return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+ return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
+ const struct timespec64 *ts)
+{
+ return -EOPNOTSUPP;
+}
+
+static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
+ struct ptp_system_timestamp *sts)
+{
+ struct vmclock_state *st = container_of(ptp, struct vmclock_state,
+ ptp_clock_info);
+
+ return vmclock_get_crosststamp(st, sts, NULL, ts);
+}
+
+static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
+ struct ptp_clock_request *rq, int on)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info ptp_vmclock_info = {
+ .owner = THIS_MODULE,
+ .max_adj = 0,
+ .n_ext_ts = 0,
+ .n_pins = 0,
+ .pps = 0,
+ .adjfine = ptp_vmclock_adjfine,
+ .adjtime = ptp_vmclock_adjtime,
+ .gettimex64 = ptp_vmclock_gettimex,
+ .settime64 = ptp_vmclock_settime,
+ .enable = ptp_vmclock_enable,
+ .getcrosststamp = ptp_vmclock_getcrosststamp,
+};
+
+static struct ptp_clock *vmclock_ptp_register(struct device *dev,
+ struct vmclock_state *st)
+{
+ enum clocksource_ids cs_id;
+
+ if (IS_ENABLED(CONFIG_ARM64) &&
+ st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
+ /* Can we check it's the virtual counter? */
+ cs_id = CSID_ARM_ARCH_COUNTER;
+ } else if (IS_ENABLED(CONFIG_X86) &&
+ st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
+ cs_id = CSID_X86_TSC;
+ } else {
+ return NULL;
+ }
+
+ /* Only UTC, or TAI with offset */
+ if (!tai_adjust(st->clk, NULL)) {
+ dev_info(dev, "vmclock does not provide unambiguous UTC\n");
+ return NULL;
+ }
+
+ st->sys_cs_id = cs_id;
+ st->cs_id = cs_id;
+ st->ptp_clock_info = ptp_vmclock_info;
+ strscpy(st->ptp_clock_info.name, st->name);
+
+ return ptp_clock_register(&st->ptp_clock_info, dev);
+}
+
+static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct vmclock_state *st = container_of(fp->private_data,
+ struct vmclock_state, miscdev);
+
+ if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
+ return -EROFS;
+
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
+ return -EINVAL;
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ st->res.start >> PAGE_SHIFT, PAGE_SIZE,
+ vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct vmclock_state *st = container_of(fp->private_data,
+ struct vmclock_state, miscdev);
+ ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+ size_t max_count;
+ uint32_t seq;
+
+ if (*ppos >= PAGE_SIZE)
+ return 0;
+
+ max_count = PAGE_SIZE - *ppos;
+ if (count > max_count)
+ count = max_count;
+
+ while (1) {
+ seq = le32_to_cpu(st->clk->seq_count) & ~1U;
+ /* Pairs with hypervisor wmb */
+ virt_rmb();
+
+ if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
+ return -EFAULT;
+
+ /* Pairs with hypervisor wmb */
+ virt_rmb();
+ if (seq == le32_to_cpu(st->clk->seq_count))
+ break;
+
+ if (ktime_after(ktime_get(), deadline))
+ return -ETIMEDOUT;
+ }
+
+ *ppos += count;
+ return count;
+}
+
+static const struct file_operations vmclock_miscdev_fops = {
+ .mmap = vmclock_miscdev_mmap,
+ .read = vmclock_miscdev_read,
+};
+
+/* module operations */
+
+static void vmclock_remove(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct vmclock_state *st = dev_get_drvdata(dev);
+
+ if (st->ptp_clock)
+ ptp_clock_unregister(st->ptp_clock);
+
+ if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
+ misc_deregister(&st->miscdev);
+}
+
+static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
+{
+ struct vmclock_state *st = data;
+ struct resource_win win;
+ struct resource *res = &win.res;
+
+ if (ares->type == ACPI_RESOURCE_TYPE_END_TAG)
+ return AE_OK;
+
+ /* There can be only one */
+ if (resource_type(&st->res) == IORESOURCE_MEM)
+ return AE_ERROR;
+
+ if (acpi_dev_resource_memory(ares, res) ||
+ acpi_dev_resource_address_space(ares, &win)) {
+
+ if (resource_type(res) != IORESOURCE_MEM ||
+ resource_size(res) < sizeof(st->clk))
+ return AE_ERROR;
+
+ st->res = *res;
+ return AE_OK;
+ }
+
+ return AE_ERROR;
+}
+
+static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
+{
+ struct acpi_device *adev = ACPI_COMPANION(dev);
+ acpi_status status;
+
+ /*
+ * This should never happen as this function is only called when
+ * has_acpi_companion(dev) is true, but the logic is sufficiently
+ * complex that Coverity can't see the tautology.
+ */
+ if (!adev)
+ return -ENODEV;
+
+ status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS,
+ vmclock_acpi_resources, st);
+ if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) {
+ dev_err(dev, "failed to get resources\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void vmclock_put_idx(void *data)
+{
+ struct vmclock_state *st = data;
+
+ ida_free(&vmclock_ida, st->index);
+}
+
+static int vmclock_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct vmclock_state *st;
+ int ret;
+
+ st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL);
+ if (!st)
+ return -ENOMEM;
+
+ if (has_acpi_companion(dev))
+ ret = vmclock_probe_acpi(dev, st);
+ else
+ ret = -EINVAL; /* Only ACPI for now */
+
+ if (ret) {
+ dev_info(dev, "Failed to obtain physical address: %d\n", ret);
+ goto out;
+ }
+
+ if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) {
+ dev_info(dev, "Region too small (0x%llx)\n",
+ resource_size(&st->res));
+ ret = -EINVAL;
+ goto out;
+ }
+ st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res),
+ MEMREMAP_WB | MEMREMAP_DEC);
+ if (IS_ERR(st->clk)) {
+ ret = PTR_ERR(st->clk);
+ dev_info(dev, "failed to map shared memory\n");
+ st->clk = NULL;
+ goto out;
+ }
+
+ if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC ||
+ le32_to_cpu(st->clk->size) > resource_size(&st->res) ||
+ le16_to_cpu(st->clk->version) != 1) {
+ dev_info(dev, "vmclock magic fields invalid\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ida_alloc(&vmclock_ida, GFP_KERNEL);
+ if (ret < 0)
+ goto out;
+
+ st->index = ret;
+ ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st);
+ if (ret)
+ goto out;
+
+ st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index);
+ if (!st->name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * If the structure is big enough, it can be mapped to userspace.
+ * Theoretically a guest OS even using larger pages could still
+ * use 4KiB PTEs to map smaller MMIO regions like this, but let's
+ * cross that bridge if/when we come to it.
+ */
+ if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) {
+ st->miscdev.minor = MISC_DYNAMIC_MINOR;
+ st->miscdev.fops = &vmclock_miscdev_fops;
+ st->miscdev.name = st->name;
+
+ ret = misc_register(&st->miscdev);
+ if (ret)
+ goto out;
+ }
+
+ /* If there is valid clock information, register a PTP clock */
+ if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) {
+ /* Can return a silent NULL, or an error. */
+ st->ptp_clock = vmclock_ptp_register(dev, st);
+ if (IS_ERR(st->ptp_clock)) {
+ ret = PTR_ERR(st->ptp_clock);
+ st->ptp_clock = NULL;
+ vmclock_remove(pdev);
+ goto out;
+ }
+ }
+
+ if (!st->miscdev.minor && !st->ptp_clock) {
+ /* Neither miscdev nor PTP registered */
+ dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n");
+ ret = -ENODEV;
+ goto out;
+ }
+
+ dev_info(dev, "%s: registered %s%s%s\n", st->name,
+ st->miscdev.minor ? "miscdev" : "",
+ (st->miscdev.minor && st->ptp_clock) ? ", " : "",
+ st->ptp_clock ? "PTP" : "");
+
+ dev_set_drvdata(dev, st);
+
+ out:
+ return ret;
+}
+
+static const struct acpi_device_id vmclock_acpi_ids[] = {
+ { "AMZNC10C", 0 },
+ {}
+};
+MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids);
+
+static struct platform_driver vmclock_platform_driver = {
+ .probe = vmclock_probe,
+ .remove_new = vmclock_remove,
+ .driver = {
+ .name = "vmclock",
+ .acpi_match_table = vmclock_acpi_ids,
+ },
+};
+
+module_platform_driver(vmclock_platform_driver)
+
+MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
+MODULE_DESCRIPTION("PTP clock using VMCLOCK");
+MODULE_LICENSE("GPL");
new file mode 100644
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+
+/*
+ * This structure provides a vDSO-style clock to VM guests, exposing the
+ * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch
+ * counter, etc.) and real time. It is designed to address the problem of
+ * live migration, which other clock enlightenments do not.
+ *
+ * When a guest is live migrated, this affects the clock in two ways.
+ *
+ * First, even between identical hosts the actual frequency of the underlying
+ * counter will change within the tolerances of its specification (typically
+ * ±50PPM, or 4 seconds a day). This frequency also varies over time on the
+ * same host, but can be tracked by NTP as it generally varies slowly. With
+ * live migration there is a step change in the frequency, with no warning.
+ *
+ * Second, there may be a step change in the value of the counter itself, as
+ * its accuracy is limited by the precision of the NTP synchronization on the
+ * source and destination hosts.
+ *
+ * So any calibration (NTP, PTP, etc.) which the guest has done on the source
+ * host before migration is invalid, and needs to be redone on the new host.
+ *
+ * In its most basic mode, this structure provides only an indication to the
+ * guest that live migration has occurred. This allows the guest to know that
+ * its clock is invalid and take remedial action. For applications that need
+ * reliable accurate timestamps (e.g. distributed databases), the structure
+ * can be mapped all the way to userspace. This allows the application to see
+ * directly for itself that the clock is disrupted and take appropriate
+ * action, even when using a vDSO-style method to get the time instead of a
+ * system call.
+ *
+ * In its more advanced mode. this structure can also be used to expose the
+ * precise relationship of the CPU counter to real time, as calibrated by the
+ * host. This means that userspace applications can have accurate time
+ * immediately after live migration, rather than having to pause operations
+ * and wait for NTP to recover. This mode does, of course, rely on the
+ * counter being reliable and consistent across CPUs.
+ *
+ * Note that this must be true UTC, never with smeared leap seconds. If a
+ * guest wishes to construct a smeared clock, it can do so. Presenting a
+ * smeared clock through this interface would be problematic because it
+ * actually messes with the apparent counter *period*. A linear smearing
+ * of 1 ms per second would effectively tweak the counter period by 1000PPM
+ * at the start/end of the smearing period, while a sinusoidal smear would
+ * basically be impossible to represent.
+ *
+ * This structure is offered with the intent that it be adopted into the
+ * nascent virtio-rtc standard, as a virtio-rtc that does not address the live
+ * migration problem seems a little less than fit for purpose. For that
+ * reason, certain fields use precisely the same numeric definitions as in
+ * the virtio-rtc proposal. The structure can also be exposed through an ACPI
+ * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for
+ * the fact that it uses a real _CRS to convey the address of the structure
+ * (which should be a full page, to allow for mapping directly to userspace).
+ */
+
+#ifndef __VMCLOCK_ABI_H__
+#define __VMCLOCK_ABI_H__
+
+#include <linux/types.h>
+
+struct vmclock_abi {
+ /* CONSTANT FIELDS */
+ __le32 magic;
+#define VMCLOCK_MAGIC 0x4b4c4356 /* "VCLK" */
+ __le32 size; /* Size of region containing this structure */
+ __le16 version; /* 1 */
+ __u8 counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */
+#define VMCLOCK_COUNTER_ARM_VCNT 0
+#define VMCLOCK_COUNTER_X86_TSC 1
+#define VMCLOCK_COUNTER_INVALID 0xff
+ __u8 time_type; /* Matches VIRTIO_RTC_TYPE_xxx */
+#define VMCLOCK_TIME_UTC 0 /* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_TAI 1 /* Since 1970-01-01 00:00:00z */
+#define VMCLOCK_TIME_MONOTONIC 2 /* Since undefined epoch */
+#define VMCLOCK_TIME_INVALID_SMEARED 3 /* Not supported */
+#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED 4 /* Not supported */
+
+ /* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */
+ __le32 seq_count; /* Low bit means an update is in progress */
+ /*
+ * This field changes to another non-repeating value when the CPU
+ * counter is disrupted, for example on live migration. This lets
+ * the guest know that it should discard any calibration it has
+ * performed of the counter against external sources (NTP/PTP/etc.).
+ */
+ __le64 disruption_marker;
+ __le64 flags;
+ /* Indicates that the tai_offset_sec field is valid */
+#define VMCLOCK_FLAG_TAI_OFFSET_VALID (1 << 0)
+ /*
+ * Optionally used to notify guests of pending maintenance events.
+ * A guest which provides latency-sensitive services may wish to
+ * remove itself from service if an event is coming up. Two flags
+ * indicate the approximate imminence of the event.
+ */
+#define VMCLOCK_FLAG_DISRUPTION_SOON (1 << 1) /* About a day */
+#define VMCLOCK_FLAG_DISRUPTION_IMMINENT (1 << 2) /* About an hour */
+#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID (1 << 3)
+#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID (1 << 4)
+#define VMCLOCK_FLAG_TIME_ESTERROR_VALID (1 << 5)
+#define VMCLOCK_FLAG_TIME_MAXERROR_VALID (1 << 6)
+ /*
+ * If the MONOTONIC flag is set then (other than leap seconds) it is
+ * guaranteed that the time calculated according this structure at
+ * any given moment shall never appear to be later than the time
+ * calculated via the structure at any *later* moment.
+ *
+ * In particular, a timestamp based on a counter reading taken
+ * immediately after setting the low bit of seq_count (and the
+ * associated memory barrier), using the previously-valid time and
+ * period fields, shall never be later than a timestamp based on
+ * a counter reading taken immediately before *clearing* the low
+ * bit again after the update, using the about-to-be-valid fields.
+ */
+#define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7)
+
+ __u8 pad[2];
+ __u8 clock_status;
+#define VMCLOCK_STATUS_UNKNOWN 0
+#define VMCLOCK_STATUS_INITIALIZING 1
+#define VMCLOCK_STATUS_SYNCHRONIZED 2
+#define VMCLOCK_STATUS_FREERUNNING 3
+#define VMCLOCK_STATUS_UNRELIABLE 4
+
+ /*
+ * The time exposed through this device is never smeared. This field
+ * corresponds to the 'subtype' field in virtio-rtc, which indicates
+ * the smearing method. However in this case it provides a *hint* to
+ * the guest operating system, such that *if* the guest OS wants to
+ * provide its users with an alternative clock which does not follow
+ * UTC, it may do so in a fashion consistent with the other systems
+ * in the nearby environment.
+ */
+ __u8 leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */
+#define VMCLOCK_SMEARING_STRICT 0
+#define VMCLOCK_SMEARING_NOON_LINEAR 1
+#define VMCLOCK_SMEARING_UTC_SLS 2
+ __le16 tai_offset_sec; /* Actually two's complement signed */
+ __u8 leap_indicator;
+ /*
+ * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined
+ * in the current draft of virtio-rtc, but since smearing cannot be
+ * used with the shared memory device, some values are not used.
+ *
+ * The _POST_POS and _POST_NEG values allow the guest to perform
+ * its own smearing during the day or so after a leap second when
+ * such smearing may need to continue being applied for a leap
+ * second which is now theoretically "historical".
+ */
+#define VMCLOCK_LEAP_NONE 0x00 /* No known nearby leap second */
+#define VMCLOCK_LEAP_PRE_POS 0x01 /* Positive leap second at EOM */
+#define VMCLOCK_LEAP_PRE_NEG 0x02 /* Negative leap second at EOM */
+#define VMCLOCK_LEAP_POS 0x03 /* Set during 23:59:60 second */
+#define VMCLOCK_LEAP_POST_POS 0x04
+#define VMCLOCK_LEAP_POST_NEG 0x05
+
+ /* Bit shift for counter_period_frac_sec and its error rate */
+ __u8 counter_period_shift;
+ /*
+ * Paired values of counter and UTC at a given point in time.
+ */
+ __le64 counter_value;
+ /*
+ * Counter period, and error margin of same. The unit of these
+ * fields is 1/2^(64 + counter_period_shift) of a second.
+ */
+ __le64 counter_period_frac_sec;
+ __le64 counter_period_esterror_rate_frac_sec;
+ __le64 counter_period_maxerror_rate_frac_sec;
+
+ /*
+ * Time according to time_type field above.
+ */
+ __le64 time_sec; /* Seconds since time_type epoch */
+ __le64 time_frac_sec; /* Units of 1/2^64 of a second */
+ __le64 time_esterror_nanosec;
+ __le64 time_maxerror_nanosec;
+};
+
+#endif /* __VMCLOCK_ABI_H__ */