From patchwork Thu Jan 27 08:33:26 2011
X-Patchwork-Submitter: Nadav Har'El
X-Patchwork-Id: 510521
Date: Thu, 27 Jan 2011 10:33:26 +0200
Message-Id: <201101270833.p0R8XQ4w002480@rice.haifa.ibm.com>
From: "Nadav Har'El"
To: kvm@vger.kernel.org
Cc: gleb@redhat.com, avi@redhat.com
References: <1296116987-nyh@il.ibm.com>
Subject: [PATCH 07/29] nVMX: Hold a vmcs02 for each vmcs12

--- .before/arch/x86/kvm/vmx.c	2011-01-26 18:06:03.000000000 +0200
+++ .after/arch/x86/kvm/vmx.c	2011-01-26 18:06:03.000000000 +0200
@@ -117,6 +117,7 @@ static int ple_window = KVM_VMX_DEFAULT_
 module_param(ple_window, int, S_IRUGO);
 
 #define NR_AUTOLOAD_MSRS 1
+#define NESTED_MAX_VMCS 256
 
 struct vmcs {
 	u32 revision_id;
@@ -159,6 +160,34 @@ struct __packed vmcs12 {
 #define VMCS12_REVISION 0x11e57ed0
 
 /*
+ * When we temporarily switch a vcpu's VMCS (e.g., stop using an L1's VMCS
+ * while we use L2's VMCS), and wish to save the previous VMCS, we must also
+ * remember on which CPU it was last loaded (vcpu->cpu), so when we return to
+ * using this VMCS we'll know if we're now running on a different CPU and need
+ * to clear the VMCS on the old CPU, and load it on the new one. Additionally,
+ * we need to remember whether this VMCS was launched (vmx->launched), so when
+ * we return to it we know whether to VMLAUNCH or to VMRESUME it (we cannot
+ * deduce this from other state, because it's possible that this VMCS had once
+ * been launched, but has since been cleared after a CPU switch, and now
+ * vmx->launched is 0).
+ */
+struct saved_vmcs {
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+};
+
+/*
+ * A cache keeping a VMCS (vmcs02) for each loaded vmcs12. In addition to the
+ * VMCS, we need information on its state - see struct saved_vmcs above.
+ */
+struct vmcs_list {
+	struct list_head list;
+	gpa_t vmcs12_addr;
+	struct saved_vmcs vmcs02;
+};
+
+/*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu. For example,
  * the current VMCS set by L1, a list of the VMCSs used to run the active
@@ -173,6 +202,10 @@ struct nested_vmx {
 	/* The host-usable pointer to the above */
 	struct page *current_vmcs12_page;
 	struct vmcs12 *current_vmcs12;
+
+	/* list of real (hardware) VMCSs, one for each L2 guest of L1 */
+	struct list_head vmcs02_list; /* a vmcs_list */
+	int vmcs02_num;
 };
 
 struct vcpu_vmx {
@@ -3964,6 +3997,110 @@ static int handle_invalid_op(struct kvm_
 	return 1;
 }
 
+/* Find a vmcs02 saved for the current L2's vmcs12 */
+static struct saved_vmcs *nested_get_current_vmcs(struct vcpu_vmx *vmx)
+{
+	struct vmcs_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_list, list)
+		if (item->vmcs12_addr == vmx->nested.current_vmptr)
+			return &item->vmcs02;
+	return NULL;
+}
+
+/*
+ * Allocate an L0 VMCS (vmcs02) for the current L1 VMCS (vmcs12), if one
+ * does not already exist. The allocation is done in L0 memory, so to avoid
+ * denial-of-service attacks by guests, we limit the number of concurrently-
+ * allocated VMCSs. A well-behaving L1 will VMCLEAR unused vmcs12s and not
+ * trigger this limit.
+ */
+static int nested_create_current_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vmcs_list *new_l2_guest;
+	struct vmcs *vmcs02;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (nested_get_current_vmcs(vmx))
+		return 0; /* nothing to do - we already have a VMCS */
+
+	if (vmx->nested.vmcs02_num >= NESTED_MAX_VMCS)
+		return -ENOMEM;
+
+	new_l2_guest = (struct vmcs_list *)
+		kmalloc(sizeof(struct vmcs_list), GFP_KERNEL);
+	if (!new_l2_guest)
+		return -ENOMEM;
+
+	vmcs02 = alloc_vmcs();
+	if (!vmcs02) {
+		kfree(new_l2_guest);
+		return -ENOMEM;
+	}
+
+	new_l2_guest->vmcs12_addr = vmx->nested.current_vmptr;
+	new_l2_guest->vmcs02.vmcs = vmcs02;
+	new_l2_guest->vmcs02.cpu = -1;
+	new_l2_guest->vmcs02.launched = 0;
+	list_add(&(new_l2_guest->list), &(vmx->nested.vmcs02_list));
+	vmx->nested.vmcs02_num++;
+	return 0;
+}
+
+static void __nested_free_saved_vmcs(void *arg)
+{
+	struct saved_vmcs *saved_vmcs = arg;
+	int cpu = raw_smp_processor_id();
+
+	if (saved_vmcs->cpu == cpu) /* TODO: how can this not be the case? */
+		vmcs_clear(saved_vmcs->vmcs);
+	if (per_cpu(current_vmcs, cpu) == saved_vmcs->vmcs)
+		per_cpu(current_vmcs, cpu) = NULL;
+}
+
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ * (the necessary information is in the saved_vmcs structure).
+ * See also vcpu_clear() (with different parameters and side-effects).
+ */
+static void nested_free_saved_vmcs(struct vcpu_vmx *vmx,
+		struct saved_vmcs *saved_vmcs)
+{
+	if (saved_vmcs->cpu != -1)
+		smp_call_function_single(saved_vmcs->cpu,
+				__nested_free_saved_vmcs, saved_vmcs, 1);
+
+	free_vmcs(saved_vmcs->vmcs);
+}
+
+/*
+ * Free a vmcs12's associated vmcs02 (if there is one), and remove it from
+ * vmcs02_list.
+ */
+static void nested_free_vmcs(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+	struct vmcs_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_list, list)
+		if (item->vmcs12_addr == vmptr) {
+			nested_free_saved_vmcs(vmx, &item->vmcs02);
+			list_del(&item->list);
+			kfree(item);
+			vmx->nested.vmcs02_num--;
+			return;
+		}
+}
+
+/* Free all vmcs02s saved for this L1 vcpu */
+static void nested_free_all_vmcs(struct vcpu_vmx *vmx)
+{
+	struct vmcs_list *item, *n;
+	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_list, list) {
+		nested_free_saved_vmcs(vmx, &item->vmcs02);
+		list_del(&item->list);
+		kfree(item);
+	}
+	vmx->nested.vmcs02_num = 0;
+}
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -4000,6 +4137,9 @@ static int handle_vmon(struct kvm_vcpu *
 		return 1;
 	}
 
+	INIT_LIST_HEAD(&(vmx->nested.vmcs02_list));
+	vmx->nested.vmcs02_num = 0;
+
 	vmx->nested.vmxon = true;
 
 	skip_emulated_instruction(vcpu);
@@ -4050,6 +4190,8 @@ static void free_nested(struct vcpu_vmx
 		nested_release_page(vmx->nested.current_vmcs12_page);
 		vmx->nested.current_vmptr = -1ull;
 	}
+
+	nested_free_all_vmcs(vmx);
 }
 
 /* Emulate the VMXOFF instruction */
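
To see how the pool is meant to be driven, here is a rough sketch of the call
sites that later patches in this series are expected to add. The example_*
wrappers and their exact placement are assumptions; only the nested_*_vmcs()
helpers and free_nested() come from this patch:

/*
 * Illustrative sketch only - not part of this patch. Wiring into the
 * VMPTRLD/VMCLEAR emulation is assumed from later patches in the series.
 */

/* When L1 makes a vmcs12 current (VMPTRLD), back it with a vmcs02 */
static int example_on_vmptrld(struct kvm_vcpu *vcpu, gpa_t vmcs12_addr)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx->nested.current_vmptr = vmcs12_addr;
	/* no-op if a vmcs02 is already cached for this vmcs12 */
	return nested_create_current_vmcs(vcpu);
}

/* When L1 VMCLEARs a vmcs12, its vmcs02 can be freed */
static void example_on_vmclear(struct kvm_vcpu *vcpu, gpa_t vmcs12_addr)
{
	nested_free_vmcs(to_vmx(vcpu), vmcs12_addr);
}

/* On VMXOFF or vcpu teardown, free_nested() calls nested_free_all_vmcs() */

Note that with NESTED_MAX_VMCS at 256, and alloc_vmcs() typically returning a
single 4 KiB page per VMCS, an L1 that never VMCLEARs its vmcs12s can pin at
most roughly 1 MiB of L0 memory per vcpu through this cache.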
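
The .cpu and .launched fields of struct saved_vmcs are consumed when a later
patch in the series switches the vcpu onto a cached vmcs02. A rough sketch of
that switch, assuming the existing vmx_vcpu_load()/vcpu_clear() behaviour in
vmx.c and a hypothetical helper name (saving the previously active VMCS and
host state is elided here):

/*
 * Illustrative sketch only - not part of this patch. It shows why
 * saved_vmcs remembers .cpu and .launched.
 */
static void example_switch_to_saved_vmcs(struct kvm_vcpu *vcpu,
		struct saved_vmcs *saved)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu = get_cpu();

	vmx->vmcs = saved->vmcs;	/* the hardware VMCS to run with */
	vmx->launched = saved->launched; /* VMRESUME only if already launched */
	/*
	 * Pretend the vcpu last ran where this VMCS was last loaded, so that
	 * vmx_vcpu_load() sees a CPU change and clears the VMCS on the old
	 * CPU before loading it on this one.
	 */
	vcpu->cpu = saved->cpu;
	vmx_vcpu_load(vcpu, cpu);
	vcpu->cpu = cpu;
	put_cpu();
}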