@@ -846,7 +846,8 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
/*
* We can't update the state of a "pushed" VCPU, but that
- * shouldn't happen.
+ * shouldn't happen because the vcpu->mutex makes running a
+ * vcpu mutually exclusive with doing one_reg get/set on it.
*/
if (WARN_ON(vcpu->arch.xive_pushed))
return -EIO;
@@ -1835,7 +1836,7 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
}
/*
- * Called when device fd is closed
+ * Called when device fd is closed. kvm->lock is held.
*/
static void kvmppc_xive_release(struct kvm_device *dev)
{
@@ -1843,16 +1844,42 @@ static void kvmppc_xive_release(struct kvm_device *dev)
struct kvm *kvm = xive->kvm;
struct kvm_vcpu *vcpu;
int i;
+ int was_ready;
pr_devel("Releasing xive device\n");
/*
- * When releasing the KVM device fd, the vCPUs can still be
- * running and we should clean up the vCPU interrupt
- * presenters first.
+ * Clearing mmu_ready temporarily while holding kvm->lock
+ * is a way of ensuring that no vcpus can enter the guest
+ * until we drop kvm->lock. Doing kick_all_cpus_sync()
+ * ensures that any vcpu executing inside the guest has
+ * exited the guest. Once kick_all_cpus_sync() has finished,
+ * we know that no vcpu can be executing the XIVE push or
+ * pull code, or executing a XICS hcall.
+ *
+ * Since this is the device release function, we know that
+ * userspace does not have any open fd referring to the
+ * device. Therefore there can not be any of the device
+ * attribute set/get functions being executed concurrently,
+ * and similarly, the connect_vcpu and set/clr_mapped
+ * functions also cannot be being executed.
+ */
+ was_ready = kvm->arch.mmu_ready;
+ kvm->arch.mmu_ready = 0;
+ kick_all_cpus_sync();
+
+ /*
+ * We should clean up the vCPU interrupt presenters first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+ * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
+ */
+ mutex_lock(&vcpu->mutex);
kvmppc_xive_cleanup_vcpu(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ }
debugfs_remove(xive->dentry);
@@ -1870,6 +1897,8 @@ static void kvmppc_xive_release(struct kvm_device *dev)
if (xive->vp_base != XIVE_INVALID_VP)
xive_native_free_vp_block(xive->vp_base);
+ kvm->arch.mmu_ready = was_ready;
+
/*
* A reference of the kvmppc_xive pointer is now kept under
* the xive_devices struct of the machine for reuse. It is
@@ -1906,6 +1935,9 @@ struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
return xive;
}
+/*
+ * Create a XICS device with XIVE backend. kvm->lock is held.
+ */
static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xive *xive;
@@ -973,18 +973,45 @@ static void kvmppc_xive_native_release(struct kvm_device *dev)
struct kvm *kvm = xive->kvm;
struct kvm_vcpu *vcpu;
int i;
+ int was_ready;
debugfs_remove(xive->dentry);
pr_devel("Releasing xive native device\n");
/*
- * When releasing the KVM device fd, the vCPUs can still be
- * running and we should clean up the vCPU interrupt
- * presenters first.
+ * Clearing mmu_ready temporarily while holding kvm->lock
+ * is a way of ensuring that no vcpus can enter the guest
+ * until we drop kvm->lock. Doing kick_all_cpus_sync()
+ * ensures that any vcpu executing inside the guest has
+ * exited the guest. Once kick_all_cpus_sync() has finished,
+ * we know that no vcpu can be executing the XIVE push or
+ * pull code or accessing the XIVE MMIO regions.
+ *
+ * Since this is the device release function, we know that
+ * userspace does not have any open fd or mmap referring to
+ * the device. Therefore there can not be any of the
+ * device attribute set/get, mmap, or page fault functions
+ * being executed concurrently, and similarly, the
+ * connect_vcpu and set/clr_mapped functions also cannot
+ * be being executed.
*/
- kvm_for_each_vcpu(i, vcpu, kvm)
+ was_ready = kvm->arch.mmu_ready;
+ kvm->arch.mmu_ready = 0;
+ kick_all_cpus_sync();
+
+ /*
+ * We should clean up the vCPU interrupt presenters first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+ * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done.
+ */
+ mutex_lock(&vcpu->mutex);
kvmppc_xive_native_cleanup_vcpu(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ }
if (kvm)
kvm->arch.xive = NULL;
@@ -999,6 +1026,8 @@ static void kvmppc_xive_native_release(struct kvm_device *dev)
if (xive->vp_base != XIVE_INVALID_VP)
xive_native_free_vp_block(xive->vp_base);
+ kvm->arch.mmu_ready = was_ready;
+
/*
* A reference of the kvmppc_xive pointer is now kept under
* the xive_devices struct of the machine for reuse. It is
@@ -1009,6 +1038,9 @@ static void kvmppc_xive_native_release(struct kvm_device *dev)
kfree(dev);
}
+/*
+ * Create a XIVE device. kvm->lock is held.
+ */
static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xive *xive;
Now that we have the possibility of a XIVE or XICS-on-XIVE device being released while the VM is still running, we need to be careful about races and potential use-after-free bugs. Although the kvmppc_xive struct is not freed, but kept around for re-use, the kvmppc_xive_vcpu structs are freed, and they are used extensively in both the XIVE native and XICS-on-XIVE code. There are various ways in which XIVE code gets invoked: - VCPU entry and exit, which do push and pull operations on the XIVE hardware - one_reg get and set functions (vcpu->mutex is held) - XICS hypercalls (but only inside guest execution, not from kvmppc_pseries_do_hcall) - device creation calls (kvm->lock is held) - device callbacks - get/set attribute, mmap, pagefault, release/destroy - set_mapped/clr_mapped calls (kvm->lock is held) - connect_vcpu calls - debugfs file read callbacks Inside a device release function, we know that userspace cannot have an open file descriptor referring to the device, nor can it have any mmapped regions from the device. Therefore the device callbacks are excluded, as are the connect_vcpu calls (since they need a fd for the device). Further, since the caller holds the kvm->lock mutex, no other device creation calls or set/clr_mapped calls can be executing concurrently. To exclude VCPU execution and XICS hypercalls, we temporarily set kvm->arch.mmu_ready to 0. This forces any VCPU task that is trying to enter the guest to take the kvm->lock mutex, which is held by the caller of the release function. Then, sending an IPI to all other CPUs forces any VCPU currently executing in the guest to exit. Finally, we take the vcpu->mutex for each VCPU around the process of cleaning up and freeing its XIVE data structures, in order to exclude any one_reg get/set calls. This patch does not address the races in the debugfs read callbacks. They will be addressed in a later patch. Signed-off-by: Paul Mackerras <paulus@ozlabs.org> --- Cedric, I think we need this on top of your patch 17/17, since you are freeing the kvmppc_xive_vcpu structs. With this, we can probably also free the kvmppc_xive struct, though I haven't yet completely satisfied myself about that. If you can see any entry point that I have missed (other than the debugfs stuff), let me know. I can either apply this on top of your series, or fold it into your 17/17, whichever you prefer. arch/powerpc/kvm/book3s_xive.c | 44 ++++++++++++++++++++++++++++++----- arch/powerpc/kvm/book3s_xive_native.c | 40 +++++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 10 deletions(-)