diff mbox series

[v7,3/7] KVM: arm64: Allow userspace to configure a vCPU's virtual offset

Message ID 20210816001217.3063400-4-oupton@google.com (mailing list archive)
State New, archived
Headers show
Series KVM: arm64: Add idempotent controls to migrate guest counter | expand

Commit Message

Oliver Upton Aug. 16, 2021, 12:12 a.m. UTC
Allow userspace to access the guest's virtual counter-timer offset
through the ONE_REG interface. The value read or written is defined to
be an offset from the guest's physical counter-timer. Add some
documentation to clarify how a VMM should use this and the existing
CNTVCT_EL0.

Signed-off-by: Oliver Upton <oupton@google.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
---
 Documentation/virt/kvm/api.rst    | 10 ++++++++++
 arch/arm64/include/uapi/asm/kvm.h |  1 +
 arch/arm64/kvm/arch_timer.c       | 23 +++++++++++++++++++++++
 arch/arm64/kvm/guest.c            |  6 +++++-
 include/kvm/arm_arch_timer.h      |  1 +
 5 files changed, 40 insertions(+), 1 deletion(-)

Comments

Marc Zyngier Aug. 19, 2021, 9:11 a.m. UTC | #1
On Mon, 16 Aug 2021 01:12:13 +0100,
Oliver Upton <oupton@google.com> wrote:
> 
> Allow userspace to access the guest's virtual counter-timer offset
> through the ONE_REG interface. The value read or written is defined to
> be an offset from the guest's physical counter-timer. Add some
> documentation to clarify how a VMM should use this and the existing
> CNTVCT_EL0.
> 
> Signed-off-by: Oliver Upton <oupton@google.com>
> Reviewed-by: Andrew Jones <drjones@redhat.com>
> ---
>  Documentation/virt/kvm/api.rst    | 10 ++++++++++
>  arch/arm64/include/uapi/asm/kvm.h |  1 +
>  arch/arm64/kvm/arch_timer.c       | 23 +++++++++++++++++++++++
>  arch/arm64/kvm/guest.c            |  6 +++++-
>  include/kvm/arm_arch_timer.h      |  1 +
>  5 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index dae68e68ca23..adb04046a752 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -2463,6 +2463,16 @@ arm64 system registers have the following id bit patterns::
>       derived from the register encoding for CNTV_CVAL_EL0.  As this is
>       API, it must remain this way.
>  
> +.. warning::
> +
> +     The value of KVM_REG_ARM_TIMER_OFFSET is defined as an offset from
> +     the guest's view of the physical counter-timer.
> +
> +     Userspace should use either KVM_REG_ARM_TIMER_OFFSET or
> +     KVM_REG_ARM_TIMER_CNT to pause and resume a guest's virtual
> +     counter-timer. Mixed use of these registers could result in an
> +     unpredictable guest counter value.
> +
>  arm64 firmware pseudo-registers have the following bit pattern::
>  
>    0x6030 0000 0014 <regno:16>
> diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> index b3edde68bc3e..949a31bc10f0 100644
> --- a/arch/arm64/include/uapi/asm/kvm.h
> +++ b/arch/arm64/include/uapi/asm/kvm.h
> @@ -255,6 +255,7 @@ struct kvm_arm_copy_mte_tags {
>  #define KVM_REG_ARM_TIMER_CTL		ARM64_SYS_REG(3, 3, 14, 3, 1)
>  #define KVM_REG_ARM_TIMER_CVAL		ARM64_SYS_REG(3, 3, 14, 0, 2)
>  #define KVM_REG_ARM_TIMER_CNT		ARM64_SYS_REG(3, 3, 14, 3, 2)
> +#define KVM_REG_ARM_TIMER_OFFSET	ARM64_SYS_REG(3, 4, 14, 0, 3)
>

Andrew, does this warrant an update to the selftest that checks for
sysreg visibility?

I am also wondering how a VMM such as QEMU is going to deal with the
above restriction, given the way it blindly saves/restores all the
registers that KVM exposes, hence hitting that mixed-use that the
documentation warns about...

Thanks,

	M.
Andrew Jones Aug. 19, 2021, 10:20 a.m. UTC | #2
On Thu, Aug 19, 2021 at 10:11:09AM +0100, Marc Zyngier wrote:
> On Mon, 16 Aug 2021 01:12:13 +0100,
> Oliver Upton <oupton@google.com> wrote:
> > 
> > Allow userspace to access the guest's virtual counter-timer offset
> > through the ONE_REG interface. The value read or written is defined to
> > be an offset from the guest's physical counter-timer. Add some
> > documentation to clarify how a VMM should use this and the existing
> > CNTVCT_EL0.
> > 
> > Signed-off-by: Oliver Upton <oupton@google.com>
> > Reviewed-by: Andrew Jones <drjones@redhat.com>
> > ---
> >  Documentation/virt/kvm/api.rst    | 10 ++++++++++
> >  arch/arm64/include/uapi/asm/kvm.h |  1 +
> >  arch/arm64/kvm/arch_timer.c       | 23 +++++++++++++++++++++++
> >  arch/arm64/kvm/guest.c            |  6 +++++-
> >  include/kvm/arm_arch_timer.h      |  1 +
> >  5 files changed, 40 insertions(+), 1 deletion(-)
> > 
> > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > index dae68e68ca23..adb04046a752 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -2463,6 +2463,16 @@ arm64 system registers have the following id bit patterns::
> >       derived from the register encoding for CNTV_CVAL_EL0.  As this is
> >       API, it must remain this way.
> >  
> > +.. warning::
> > +
> > +     The value of KVM_REG_ARM_TIMER_OFFSET is defined as an offset from
> > +     the guest's view of the physical counter-timer.
> > +
> > +     Userspace should use either KVM_REG_ARM_TIMER_OFFSET or
> > +     KVM_REG_ARM_TIMER_CNT to pause and resume a guest's virtual
> > +     counter-timer. Mixed use of these registers could result in an
> > +     unpredictable guest counter value.
> > +
> >  arm64 firmware pseudo-registers have the following bit pattern::
> >  
> >    0x6030 0000 0014 <regno:16>
> > diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
> > index b3edde68bc3e..949a31bc10f0 100644
> > --- a/arch/arm64/include/uapi/asm/kvm.h
> > +++ b/arch/arm64/include/uapi/asm/kvm.h
> > @@ -255,6 +255,7 @@ struct kvm_arm_copy_mte_tags {
> >  #define KVM_REG_ARM_TIMER_CTL		ARM64_SYS_REG(3, 3, 14, 3, 1)
> >  #define KVM_REG_ARM_TIMER_CVAL		ARM64_SYS_REG(3, 3, 14, 0, 2)
> >  #define KVM_REG_ARM_TIMER_CNT		ARM64_SYS_REG(3, 3, 14, 3, 2)
> > +#define KVM_REG_ARM_TIMER_OFFSET	ARM64_SYS_REG(3, 4, 14, 0, 3)
> >
> 
> Andrew, does this warrant an update to the selftest that checks for
> sysreg visibility?

Yup, until we do, the test will emit a warning with a suggestion to add
the new register to the list. It won't be a test FAIL, because adding new
registers doesn't break migration from older kernels, but we might as well
update the list sooner than later.

> 
> I am also wondering how a VMM such as QEMU is going to deal with the
> above restriction, given the way it blindly saves/restores all the
> registers that KVM exposes, hence hitting that mixed-use that the
> documentation warns about...

You're right and I think it's a problem. While we can special case
registers in QEMU using a cpreg "level" so they won't get saved/restored
all the time, it doesn't help here since we won't be special casing
KVM_REG_ARM_TIMER_OFFSET in older QEMU. We need a way for the VMM to opt
in to using KVM_REG_ARM_TIMER_OFFSET, such as with a CAP we can enable.

Thanks,
drew
Oliver Upton Aug. 29, 2021, 2:35 a.m. UTC | #3
On Mon, Aug 16, 2021 at 12:12:13AM +0000, Oliver Upton wrote:
> Allow userspace to access the guest's virtual counter-timer offset
> through the ONE_REG interface. The value read or written is defined to
> be an offset from the guest's physical counter-timer. Add some
> documentation to clarify how a VMM should use this and the existing
> CNTVCT_EL0.
> 
> Signed-off-by: Oliver Upton <oupton@google.com>
> Reviewed-by: Andrew Jones <drjones@redhat.com>

Hrm...

I was mulling on this patch a bit more and had a thought. As previously
discussed, the patch implements virtual offsets by broadcasting the same
offset to all vCPUs in a guest. I wonder if this may tolerate bad
practices from userspace that will break when KVM supports NV.

Consider that a nested guest may set CNTVOFF_EL2 to whatever value it
wants. Presumably, we will need to patch the handling of CNTVOFF_EL2 to
*not* broadcast to all vCPUs to save/restore NV properly. If a maligned
VMM only wrote to a single vCPU, banking on this broadcasting
implementation, it will fall flat on its face when handling an NV guest.

So, should we preemptively move to the new way of the world, wherein
userspace accesses to CNTVOFF_EL2 are vCPU-local rather than VM-wide?

No strong opinions in either direction, but figured I'd address it since
I'll need to respin this series anyway to fix ECV+nVHE.

--
Thanks,
Oliver
Marc Zyngier Sept. 6, 2021, 9:02 a.m. UTC | #4
On Sun, 29 Aug 2021 03:35:30 +0100,
Oliver Upton <oupton@google.com> wrote:
> 
> On Mon, Aug 16, 2021 at 12:12:13AM +0000, Oliver Upton wrote:
> > Allow userspace to access the guest's virtual counter-timer offset
> > through the ONE_REG interface. The value read or written is defined to
> > be an offset from the guest's physical counter-timer. Add some
> > documentation to clarify how a VMM should use this and the existing
> > CNTVCT_EL0.
> > 
> > Signed-off-by: Oliver Upton <oupton@google.com>
> > Reviewed-by: Andrew Jones <drjones@redhat.com>
> 
> Hrm...
> 
> I was mulling on this patch a bit more and had a thought. As previously
> discussed, the patch implements virtual offsets by broadcasting the same
> offset to all vCPUs in a guest. I wonder if this may tolerate bad
> practices from userspace that will break when KVM supports NV.
> 
> Consider that a nested guest may set CNTVOFF_EL2 to whatever value it
> wants. Presumably, we will need to patch the handling of CNTVOFF_EL2 to
> *not* broadcast to all vCPUs to save/restore NV properly. If a maligned
> VMM only wrote to a single vCPU, banking on this broadcasting
> implementation, it will fall flat on its face when handling an NV guest.
> 
> So, should we preemptively move to the new way of the world, wherein
> userspace accesses to CNTVOFF_EL2 are vCPU-local rather than VM-wide?
> 
> No strong opinions in either direction, but figured I'd address it since
> I'll need to respin this series anyway to fix ECV+nVHE.

Thought about this a bit more whilst being away from a computer...

It all boils down to what we expose as an abstraction of a machine. If
there is no EL2 in the VM, then there shouldn't be any way for the
guest to observe different values for the counters as seen from
different vcpus. That's what the architecture guarantees for a
physical system, and we shouldn't deviate from that. Opening the door
for userspace to do anything differently is a recipe for disaster.

It actually is an argument in favour of setting the various offsets to
a value that keep the two physical and virtual counters in sync,
instead of the current behaviour that allows different values to be
observed.

The above is in contrast with what the architecture allows when EL2 is
present. The hypervisor can naturally deal with the offsets as it sees
fit, and no offset should have any bearing on it (this later point is
of course to be moderated by CNTPOFF on the host).

To sum it up, I'd rather keep the CNTVOFF behaviour what it is today
for guest that have their highest exception level at EL1. For EL2
guests, the setting will obviously have to become per-CPU.

	M.
diff mbox series

Patch

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index dae68e68ca23..adb04046a752 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -2463,6 +2463,16 @@  arm64 system registers have the following id bit patterns::
      derived from the register encoding for CNTV_CVAL_EL0.  As this is
      API, it must remain this way.
 
+.. warning::
+
+     The value of KVM_REG_ARM_TIMER_OFFSET is defined as an offset from
+     the guest's view of the physical counter-timer.
+
+     Userspace should use either KVM_REG_ARM_TIMER_OFFSET or
+     KVM_REG_ARM_TIMER_CNT to pause and resume a guest's virtual
+     counter-timer. Mixed use of these registers could result in an
+     unpredictable guest counter value.
+
 arm64 firmware pseudo-registers have the following bit pattern::
 
   0x6030 0000 0014 <regno:16>
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index b3edde68bc3e..949a31bc10f0 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -255,6 +255,7 @@  struct kvm_arm_copy_mte_tags {
 #define KVM_REG_ARM_TIMER_CTL		ARM64_SYS_REG(3, 3, 14, 3, 1)
 #define KVM_REG_ARM_TIMER_CVAL		ARM64_SYS_REG(3, 3, 14, 0, 2)
 #define KVM_REG_ARM_TIMER_CNT		ARM64_SYS_REG(3, 3, 14, 3, 2)
+#define KVM_REG_ARM_TIMER_OFFSET	ARM64_SYS_REG(3, 4, 14, 0, 3)
 
 /* KVM-as-firmware specific pseudo-registers */
 #define KVM_REG_ARM_FW			(0x0014 << KVM_REG_ARM_COPROC_SHIFT)
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index cf2f4a034dbe..9d9bac3ec40e 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -92,6 +92,18 @@  static u64 timer_get_offset(struct arch_timer_context *ctxt)
 	}
 }
 
+static u64 timer_get_guest_offset(struct arch_timer_context *ctxt)
+{
+	struct kvm_vcpu *vcpu = ctxt->vcpu;
+
+	switch (arch_timer_ctx_index(ctxt)) {
+	case TIMER_VTIMER:
+		return __vcpu_sys_reg(vcpu, CNTVOFF_EL2);
+	default:
+		return 0;
+	}
+}
+
 static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
 {
 	struct kvm_vcpu *vcpu = ctxt->vcpu;
@@ -852,6 +864,10 @@  int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 		timer = vcpu_vtimer(vcpu);
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
 		break;
+	case KVM_REG_ARM_TIMER_OFFSET:
+		timer = vcpu_vtimer(vcpu);
+		update_vtimer_cntvoff(vcpu, value);
+		break;
 	case KVM_REG_ARM_PTIMER_CTL:
 		timer = vcpu_ptimer(vcpu);
 		kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
@@ -896,6 +912,9 @@  u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
 	case KVM_REG_ARM_TIMER_CVAL:
 		return kvm_arm_timer_read(vcpu,
 					  vcpu_vtimer(vcpu), TIMER_REG_CVAL);
+	case KVM_REG_ARM_TIMER_OFFSET:
+		return kvm_arm_timer_read(vcpu,
+					  vcpu_vtimer(vcpu), TIMER_REG_OFFSET);
 	case KVM_REG_ARM_PTIMER_CTL:
 		return kvm_arm_timer_read(vcpu,
 					  vcpu_ptimer(vcpu), TIMER_REG_CTL);
@@ -933,6 +952,10 @@  static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
 		val = kvm_phys_timer_read() - timer_get_offset(timer);
 		break;
 
+	case TIMER_REG_OFFSET:
+		val = timer_get_guest_offset(timer);
+		break;
+
 	default:
 		BUG();
 	}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 1dfb83578277..17fc06e2b422 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -591,7 +591,7 @@  static unsigned long num_core_regs(const struct kvm_vcpu *vcpu)
  * ARM64 versions of the TIMER registers, always available on arm64
  */
 
-#define NUM_TIMER_REGS 3
+#define NUM_TIMER_REGS 4
 
 static bool is_timer_reg(u64 index)
 {
@@ -599,6 +599,7 @@  static bool is_timer_reg(u64 index)
 	case KVM_REG_ARM_TIMER_CTL:
 	case KVM_REG_ARM_TIMER_CNT:
 	case KVM_REG_ARM_TIMER_CVAL:
+	case KVM_REG_ARM_TIMER_OFFSET:
 		return true;
 	}
 	return false;
@@ -614,6 +615,9 @@  static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
 	uindices++;
 	if (put_user(KVM_REG_ARM_TIMER_CVAL, uindices))
 		return -EFAULT;
+	uindices++;
+	if (put_user(KVM_REG_ARM_TIMER_OFFSET, uindices))
+		return -EFAULT;
 
 	return 0;
 }
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 9d65d4a29f81..615f9314f6a5 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -21,6 +21,7 @@  enum kvm_arch_timer_regs {
 	TIMER_REG_CVAL,
 	TIMER_REG_TVAL,
 	TIMER_REG_CTL,
+	TIMER_REG_OFFSET,
 };
 
 struct arch_timer_context {