Message ID | 20110523185104.GA26899@fermat.math.technion.ac.il (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
> From: Nadav Har'El > Sent: Tuesday, May 24, 2011 2:51 AM > > > >+ vmcs_init(vmx->loaded_vmcs->vmcs); > > >+ vmx->loaded_vmcs->cpu = -1; > > >+ vmx->loaded_vmcs->launched = 0; > > > > Perhaps a loaded_vmcs_init() to encapsulate initialization of these > > three fields, you'll probably reuse it later. > > It's good you pointed this out, because it made me suddenly realise that I > forgot to VMCLEAR the new vmcs02's I allocate. In practice it never made a > difference, but better safe than sorry. yes, that's what spec requires. You need VMCLEAR on any new VMCS which does implementation specific initialization in that VMCS region. > > I had to restructure some of the code a bit to be able to properly use this > new function (in 3 places - __loaded_vmcs_clear, nested_get_current_vmcs02, > vmx_create_cpu). > > > Please repost separately after the fix, I'd like to apply it before the > > rest of the series. > > I am adding a new version of this patch at the end of this mail. > > > (regarding interrupts, I think we can do that work post-merge. But I'd > > like to see Kevin's comments addressed) > > I replied to his comments. Done some of the things he asked, and asked for > more info on why/where he believes the current code is incorrect where I > didn't understand what problems he pointed to, and am now waiting for him > to reply. As I replied in another thread, I believe this has been explained clearly by Nadav. > > > ------- 8< ------ 8< ---------- 8< ---------- 8< ----------- 8< ----------- > > Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus. > > In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it > because (at least in theory) the processor might not have written all of its > content back to memory. Since a patch from June 26, 2008, this is done using > a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU. > > The problem is that with nested VMX, we no longer have the concept of a > vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for > L2s), and each of those may be have been last loaded on a different cpu. > > So instead of linking the vcpus, we link the VMCSs, using a new structure > loaded_vmcs. This structure contains the VMCS, and the information > pertaining > to its loading on a specific cpu (namely, the cpu number, and whether it > was already launched on this cpu once). In nested we will also use the same > structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the > currently active VMCS. > > Signed-off-by: Nadav Har'El <nyh@il.ibm.com> > --- > arch/x86/kvm/vmx.c | 150 ++++++++++++++++++++++++------------------- > 1 file changed, 86 insertions(+), 64 deletions(-) > > --- .before/arch/x86/kvm/vmx.c 2011-05-23 21:46:14.000000000 +0300 > +++ .after/arch/x86/kvm/vmx.c 2011-05-23 21:46:14.000000000 +0300 > @@ -116,6 +116,18 @@ struct vmcs { > char data[0]; > }; > > +/* > + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also > + * remember whether it was VMLAUNCHed, and maintain a linked list of all > VMCSs > + * loaded on this CPU (so we can clear them if the CPU goes down). > + */ > +struct loaded_vmcs { > + struct vmcs *vmcs; > + int cpu; > + int launched; > + struct list_head loaded_vmcss_on_cpu_link; > +}; > + > struct shared_msr_entry { > unsigned index; > u64 data; > @@ -124,9 +136,7 @@ struct shared_msr_entry { > > struct vcpu_vmx { > struct kvm_vcpu vcpu; > - struct list_head local_vcpus_link; > unsigned long host_rsp; > - int launched; > u8 fail; > u8 cpl; > bool nmi_known_unmasked; > @@ -140,7 +150,14 @@ struct vcpu_vmx { > u64 msr_host_kernel_gs_base; > u64 msr_guest_kernel_gs_base; > #endif > - struct vmcs *vmcs; > + /* > + * loaded_vmcs points to the VMCS currently used in this vcpu. For a > + * non-nested (L1) guest, it always points to vmcs01. For a nested > + * guest (L2), it points to a different VMCS. > + */ > + struct loaded_vmcs vmcs01; > + struct loaded_vmcs *loaded_vmcs; > + bool __launched; /* temporary, used in > vmx_vcpu_run */ > struct msr_autoload { > unsigned nr; > struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; > @@ -200,7 +217,11 @@ static int vmx_set_tss_addr(struct kvm * > > static DEFINE_PER_CPU(struct vmcs *, vmxarea); > static DEFINE_PER_CPU(struct vmcs *, current_vmcs); > -static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); > +/* > + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is > needed > + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded > on it. > + */ > +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); > static DEFINE_PER_CPU(struct desc_ptr, host_gdt); > > static unsigned long *vmx_io_bitmap_a; > @@ -501,6 +522,13 @@ static void vmcs_clear(struct vmcs *vmcs > vmcs, phys_addr); > } > > +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) > +{ > + vmcs_clear(loaded_vmcs->vmcs); > + loaded_vmcs->cpu = -1; > + loaded_vmcs->launched = 0; > +} > + call it vmcs_init instead since you now remove original vmcs_init invocation, which more reflect the necessity of adding VMCLEAR for a new vmcs? > static void vmcs_load(struct vmcs *vmcs) > { > u64 phys_addr = __pa(vmcs); > @@ -514,25 +542,24 @@ static void vmcs_load(struct vmcs *vmcs) > vmcs, phys_addr); > } > > -static void __vcpu_clear(void *arg) > +static void __loaded_vmcs_clear(void *arg) > { > - struct vcpu_vmx *vmx = arg; > + struct loaded_vmcs *loaded_vmcs = arg; > int cpu = raw_smp_processor_id(); > > - if (vmx->vcpu.cpu == cpu) > - vmcs_clear(vmx->vmcs); > - if (per_cpu(current_vmcs, cpu) == vmx->vmcs) > + if (loaded_vmcs->cpu != cpu) > + return; /* cpu migration can race with cpu offline */ what do you mean by "cpu migration" here? why does 'cpu offline' matter here regarding to the cpu change? > + if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) > per_cpu(current_vmcs, cpu) = NULL; > - list_del(&vmx->local_vcpus_link); > - vmx->vcpu.cpu = -1; > - vmx->launched = 0; > + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); > + loaded_vmcs_init(loaded_vmcs); > } > > -static void vcpu_clear(struct vcpu_vmx *vmx) > +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) > { > - if (vmx->vcpu.cpu == -1) > - return; > - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); > + if (loaded_vmcs->cpu != -1) > + smp_call_function_single( > + loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); > } > > static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) > @@ -971,22 +998,22 @@ static void vmx_vcpu_load(struct kvm_vcp > > if (!vmm_exclusive) > kvm_cpu_vmxon(phys_addr); > - else if (vcpu->cpu != cpu) > - vcpu_clear(vmx); > + else if (vmx->loaded_vmcs->cpu != cpu) > + loaded_vmcs_clear(vmx->loaded_vmcs); > > - if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { > - per_cpu(current_vmcs, cpu) = vmx->vmcs; > - vmcs_load(vmx->vmcs); > + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { > + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; > + vmcs_load(vmx->loaded_vmcs->vmcs); > } > > - if (vcpu->cpu != cpu) { > + if (vmx->loaded_vmcs->cpu != cpu) { > struct desc_ptr *gdt = &__get_cpu_var(host_gdt); > unsigned long sysenter_esp; > > kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); > local_irq_disable(); > - list_add(&vmx->local_vcpus_link, > - &per_cpu(vcpus_on_cpu, cpu)); > + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, > + &per_cpu(loaded_vmcss_on_cpu, cpu)); > local_irq_enable(); > > /* > @@ -998,6 +1025,7 @@ static void vmx_vcpu_load(struct kvm_vcp > > rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); > vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ > + vmx->loaded_vmcs->cpu = cpu; > } > } > > @@ -1005,7 +1033,8 @@ static void vmx_vcpu_put(struct kvm_vcpu > { > __vmx_load_host_state(to_vmx(vcpu)); > if (!vmm_exclusive) { > - __vcpu_clear(to_vmx(vcpu)); > + __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); > + vcpu->cpu = -1; > kvm_cpu_vmxoff(); > } > } > @@ -1469,7 +1498,7 @@ static int hardware_enable(void *garbage > if (read_cr4() & X86_CR4_VMXE) > return -EBUSY; > > - INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); > + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); > rdmsrl(MSR_IA32_FEATURE_CONTROL, old); > > test_bits = FEATURE_CONTROL_LOCKED; > @@ -1493,14 +1522,14 @@ static int hardware_enable(void *garbage > return 0; > } > > -static void vmclear_local_vcpus(void) > +static void vmclear_local_loaded_vmcss(void) > { > int cpu = raw_smp_processor_id(); > - struct vcpu_vmx *vmx, *n; > + struct loaded_vmcs *v, *n; > > - list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), > - local_vcpus_link) > - __vcpu_clear(vmx); > + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), > + loaded_vmcss_on_cpu_link) > + __loaded_vmcs_clear(v); > } > > > @@ -1515,7 +1544,7 @@ static void kvm_cpu_vmxoff(void) > static void hardware_disable(void *garbage) > { > if (vmm_exclusive) { > - vmclear_local_vcpus(); > + vmclear_local_loaded_vmcss(); > kvm_cpu_vmxoff(); > } > write_cr4(read_cr4() & ~X86_CR4_VMXE); > @@ -1696,6 +1725,18 @@ static void free_vmcs(struct vmcs *vmcs) > free_pages((unsigned long)vmcs, vmcs_config.order); > } > > +/* > + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last > loaded > + */ > +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) > +{ > + if (!loaded_vmcs->vmcs) > + return; > + loaded_vmcs_clear(loaded_vmcs); > + free_vmcs(loaded_vmcs->vmcs); > + loaded_vmcs->vmcs = NULL; > +} prefer to not do cleanup work through loaded_vmcs since it's just a pointer to a loaded vmcs structure. Though you can carefully arrange the nested vmcs cleanup happening before it, it's not very clean and a bit error prone simply from this function itself. It's clearer to directly cleanup vmcs01, and if you want an assertion could be added to make sure loaded_vmcs doesn't point to any stale vmcs02 structure after nested cleanup step. Thanks, Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 24, 2011, Tian, Kevin wrote about "RE: [PATCH 08/31] nVMX: Fix local_vcpus_link handling": > > +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) > > +{ > > + vmcs_clear(loaded_vmcs->vmcs); > > + loaded_vmcs->cpu = -1; > > + loaded_vmcs->launched = 0; > > +} > > + > > call it vmcs_init instead since you now remove original vmcs_init invocation, > which more reflect the necessity of adding VMCLEAR for a new vmcs? The best name for this function, I think, would have been loaded_vmcs_clear, because this function isn't necessarily used to "init" - it's also called to VMCLEAR an old vmcs (and flush its content back to memory) - in that sense it is definitely not a "vmcs_init". Unfortunately, I already have a whole chain of functions with this name :( the existing loaded_vmcs_clear() does an IPI to the CPU which has this VMCS loaded, and causes it to run __loaded_vmcs_clear(), which in turn calls the above loaded_vmcs_init(). I wish I could call all three functions loaded_vmcs_clear(), but of course I can't. If anyone reading this has a good suggestion on how to name these three functions, please let me know. > > +static void __loaded_vmcs_clear(void *arg) > > { > > - struct vcpu_vmx *vmx = arg; > > + struct loaded_vmcs *loaded_vmcs = arg; > > int cpu = raw_smp_processor_id(); > > > > - if (vmx->vcpu.cpu == cpu) > > - vmcs_clear(vmx->vmcs); > > - if (per_cpu(current_vmcs, cpu) == vmx->vmcs) > > + if (loaded_vmcs->cpu != cpu) > > + return; /* cpu migration can race with cpu offline */ > > what do you mean by "cpu migration" here? why does 'cpu offline' > matter here regarding to the cpu change? __loaded_vmcs_clear() is typically called in one of two cases: "cpu migration" means that a guest that used to run on one CPU, and had its VMCS loaded there, suddenly needs to run on a different CPU, so we need to clear the VMCS on the old CPU. "cpu offline" means that we want to take a certain CPU offline, and before we do that we should VMCLEAR all VMCSs which were loaded on it. The (vmx->cpu.cpu != cpu) case in __loaded_vmcs_clear should ideally never happen: In the cpu offline path, we only call it for the loaded_vmcss which we know for sure are loaded on the current cpu. In the cpu migration path, loaded_vmcs_clear runs __loaded_vmcs_clear on the right CPU, which ensures that equality. But, there can be a race condition (this was actually explained to me a while back by Avi - I never seen this happening in practice): Imagine that cpu migration calls loaded_vmcs_clear, which tells the old cpu (via IPI) to VMCLEAR this vmcs. But before that old CPU gets a chance to act on that IPI, a decision is made to take it offline, and all loaded_vmcs loaded on it (including the one in question) are cleared. When that CPU acts on this IPI, it notices that vmx->cpu.cpu==-1, i.e., != cpu, so it doesn't need to do anything (in the new version of the code, I made this more explicit, by returning immediately in this case). At least this is the theory. As I said, I didn't see this problem in practice (unsurprising, since I never offlined any CPU). Maybe Avi or someone else can comment more about this (vmx->cpu.cpu == cpu) check, which existed before my patch - in __vcpu_clear(). > > +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) > > +{ > > + if (!loaded_vmcs->vmcs) > > + return; > > + loaded_vmcs_clear(loaded_vmcs); > > + free_vmcs(loaded_vmcs->vmcs); > > + loaded_vmcs->vmcs = NULL; > > +} > > prefer to not do cleanup work through loaded_vmcs since it's just a pointer > to a loaded vmcs structure. Though you can carefully arrange the nested > vmcs cleanup happening before it, it's not very clean and a bit error prone > simply from this function itself. It's clearer to directly cleanup vmcs01, and > if you want an assertion could be added to make sure loaded_vmcs doesn't > point to any stale vmcs02 structure after nested cleanup step. I'm afraid I didn't understand what you meant here... Basically, this free_loaded_vmcs() is just a shortcut for loaded_vmcs_clear() and free_vmcs(), as doing both is needed in 3 places: nested_free_vmcs02, nested_free_all_saved_vmcss, vmx_free_vcpu. The same function is needed for both vmcs01 and vmcs02 VMCSs - in both cases when we don't need them any more we need to VMCLEAR them and then free the VMCS memory. Note that this function does *not* free the loaded_vmcs structure itself. What's wrong with this? Would you prefer that I remove this function and explictly call loaded_vmcs_clear() and then free_vmcs() in all three places? Thanks, Nadav.
> From: Nadav Har'El [mailto:nyh@math.technion.ac.il] > Sent: Tuesday, May 24, 2011 3:57 PM > > On Tue, May 24, 2011, Tian, Kevin wrote about "RE: [PATCH 08/31] nVMX: Fix > local_vcpus_link handling": > > > +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) > > > +{ > > > + vmcs_clear(loaded_vmcs->vmcs); > > > + loaded_vmcs->cpu = -1; > > > + loaded_vmcs->launched = 0; > > > +} > > > + > > > > call it vmcs_init instead since you now remove original vmcs_init invocation, > > which more reflect the necessity of adding VMCLEAR for a new vmcs? > > The best name for this function, I think, would have been loaded_vmcs_clear, > because this function isn't necessarily used to "init" - it's also called to > VMCLEAR an old vmcs (and flush its content back to memory) - in that sense > it is definitely not a "vmcs_init". > > Unfortunately, I already have a whole chain of functions with this name :( > the existing loaded_vmcs_clear() does an IPI to the CPU which has this VMCS > loaded, and causes it to run __loaded_vmcs_clear(), which in turn calls > the above loaded_vmcs_init(). I wish I could call all three functions > loaded_vmcs_clear(), but of course I can't. If anyone reading this has a good > suggestion on how to name these three functions, please let me know. how about loaded_vmcs_reset? > > > > +static void __loaded_vmcs_clear(void *arg) > > > { > > > - struct vcpu_vmx *vmx = arg; > > > + struct loaded_vmcs *loaded_vmcs = arg; > > > int cpu = raw_smp_processor_id(); > > > > > > - if (vmx->vcpu.cpu == cpu) > > > - vmcs_clear(vmx->vmcs); > > > - if (per_cpu(current_vmcs, cpu) == vmx->vmcs) > > > + if (loaded_vmcs->cpu != cpu) > > > + return; /* cpu migration can race with cpu offline */ > > > > what do you mean by "cpu migration" here? why does 'cpu offline' > > matter here regarding to the cpu change? > > __loaded_vmcs_clear() is typically called in one of two cases: "cpu migration" > means that a guest that used to run on one CPU, and had its VMCS loaded > there, > suddenly needs to run on a different CPU, so we need to clear the VMCS on > the old CPU. "cpu offline" means that we want to take a certain CPU offline, > and before we do that we should VMCLEAR all VMCSs which were loaded on it. So here you need explicitly differentiate a vcpu and a real cpu. For the 1st case it's just 'vcpu migration", and the latter it's the real 'cpu offline'. 'cpu migration' is generally a RAS feature in mission critical world. :-) > > The (vmx->cpu.cpu != cpu) case in __loaded_vmcs_clear should ideally never > happen: In the cpu offline path, we only call it for the loaded_vmcss which > we know for sure are loaded on the current cpu. In the cpu migration path, > loaded_vmcs_clear runs __loaded_vmcs_clear on the right CPU, which ensures > that > equality. > > But, there can be a race condition (this was actually explained to me a while > back by Avi - I never seen this happening in practice): Imagine that cpu > migration calls loaded_vmcs_clear, which tells the old cpu (via IPI) to > VMCLEAR this vmcs. But before that old CPU gets a chance to act on that IPI, > a decision is made to take it offline, and all loaded_vmcs loaded on it > (including the one in question) are cleared. When that CPU acts on this IPI, > it notices that vmx->cpu.cpu==-1, i.e., != cpu, so it doesn't need to do > anything (in the new version of the code, I made this more explicit, by > returning immediately in this case). the reverse also holds true. Right between the point where cpu_offline hits a loaded_vmcs and the point where it calls __loaded_vmcs_clear, it's possible that the vcpu is migrated to another cpu, and it's likely that migration path (vmx_vcpu_load) has invoked loaded_vmcs_clear but hasn't delete this vmcs from old cpu's linked list. This way later when __loaded_vmcs_clear is invoked on the offlined cpu, there's still chance to observe cpu as -1. > > At least this is the theory. As I said, I didn't see this problem in practice > (unsurprising, since I never offlined any CPU). Maybe Avi or someone else can > comment more about this (vmx->cpu.cpu == cpu) check, which existed before > my patch - in __vcpu_clear(). I agree this check is necessary, but just want you to make the comment clear with right term. > > > > +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) > > > +{ > > > + if (!loaded_vmcs->vmcs) > > > + return; > > > + loaded_vmcs_clear(loaded_vmcs); > > > + free_vmcs(loaded_vmcs->vmcs); > > > + loaded_vmcs->vmcs = NULL; > > > +} > > > > prefer to not do cleanup work through loaded_vmcs since it's just a pointer > > to a loaded vmcs structure. Though you can carefully arrange the nested > > vmcs cleanup happening before it, it's not very clean and a bit error prone > > simply from this function itself. It's clearer to directly cleanup vmcs01, and > > if you want an assertion could be added to make sure loaded_vmcs doesn't > > point to any stale vmcs02 structure after nested cleanup step. > > I'm afraid I didn't understand what you meant here... Basically, this > free_loaded_vmcs() is just a shortcut for loaded_vmcs_clear() and free_vmcs(), > as doing both is needed in 3 places: nested_free_vmcs02, > nested_free_all_saved_vmcss, vmx_free_vcpu. The same function is needed > for both vmcs01 and vmcs02 VMCSs - in both cases when we don't need them > any > more we need to VMCLEAR them and then free the VMCS memory. Note that > this > function does *not* free the loaded_vmcs structure itself. > > What's wrong with this? > Would you prefer that I remove this function and explictly call > loaded_vmcs_clear() and then free_vmcs() in all three places? > Forgot about it. I originally thought this was only used to free vmcs01, and thus wanted to make that purpose obvious. Thanks Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/24/2011 11:20 AM, Tian, Kevin wrote: > > > > The (vmx->cpu.cpu != cpu) case in __loaded_vmcs_clear should ideally never > > happen: In the cpu offline path, we only call it for the loaded_vmcss which > > we know for sure are loaded on the current cpu. In the cpu migration path, > > loaded_vmcs_clear runs __loaded_vmcs_clear on the right CPU, which ensures > > that > > equality. > > > > But, there can be a race condition (this was actually explained to me a while > > back by Avi - I never seen this happening in practice): Imagine that cpu > > migration calls loaded_vmcs_clear, which tells the old cpu (via IPI) to > > VMCLEAR this vmcs. But before that old CPU gets a chance to act on that IPI, > > a decision is made to take it offline, and all loaded_vmcs loaded on it > > (including the one in question) are cleared. When that CPU acts on this IPI, > > it notices that vmx->cpu.cpu==-1, i.e., != cpu, so it doesn't need to do > > anything (in the new version of the code, I made this more explicit, by > > returning immediately in this case). > > the reverse also holds true. Right between the point where cpu_offline hits > a loaded_vmcs and the point where it calls __loaded_vmcs_clear, it's possible > that the vcpu is migrated to another cpu, and it's likely that migration path > (vmx_vcpu_load) has invoked loaded_vmcs_clear but hasn't delete this vmcs > from old cpu's linked list. This way later when __loaded_vmcs_clear is > invoked on the offlined cpu, there's still chance to observe cpu as -1. I don't think it's possible. Both calls are done with interrupts disabled.
> From: Avi Kivity [mailto:avi@redhat.com] > Sent: Tuesday, May 24, 2011 7:06 PM > > On 05/24/2011 11:20 AM, Tian, Kevin wrote: > > > > > > The (vmx->cpu.cpu != cpu) case in __loaded_vmcs_clear should ideally > never > > > happen: In the cpu offline path, we only call it for the loaded_vmcss which > > > we know for sure are loaded on the current cpu. In the cpu migration > path, > > > loaded_vmcs_clear runs __loaded_vmcs_clear on the right CPU, which > ensures > > > that > > > equality. > > > > > > But, there can be a race condition (this was actually explained to me a > while > > > back by Avi - I never seen this happening in practice): Imagine that cpu > > > migration calls loaded_vmcs_clear, which tells the old cpu (via IPI) to > > > VMCLEAR this vmcs. But before that old CPU gets a chance to act on that > IPI, > > > a decision is made to take it offline, and all loaded_vmcs loaded on it > > > (including the one in question) are cleared. When that CPU acts on this > IPI, > > > it notices that vmx->cpu.cpu==-1, i.e., != cpu, so it doesn't need to do > > > anything (in the new version of the code, I made this more explicit, by > > > returning immediately in this case). > > > > the reverse also holds true. Right between the point where cpu_offline hits > > a loaded_vmcs and the point where it calls __loaded_vmcs_clear, it's possible > > that the vcpu is migrated to another cpu, and it's likely that migration path > > (vmx_vcpu_load) has invoked loaded_vmcs_clear but hasn't delete this vmcs > > from old cpu's linked list. This way later when __loaded_vmcs_clear is > > invoked on the offlined cpu, there's still chance to observe cpu as -1. > > I don't think it's possible. Both calls are done with interrupts disabled. If that's the case then there's another potential issue. Deadlock may happen when calling smp_call_function_single with interrupt disabled. Thanks Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/24/2011 02:20 PM, Tian, Kevin wrote: > > I don't think it's possible. Both calls are done with interrupts disabled. > > If that's the case then there's another potential issue. Deadlock may happen > when calling smp_call_function_single with interrupt disabled. We don't do that. vcpu migration calls vcpu_clear() with interrupts enabled, which then calls smp_call_function_single(), which calls __vcpu_clear() with interrupts disabled. vmclear_local_vcpus() is called from interrupts disabled (and calls __vcpu_clear() directly).
> From: Avi Kivity [mailto:avi@redhat.com] > Sent: Tuesday, May 24, 2011 7:27 PM > > On 05/24/2011 02:20 PM, Tian, Kevin wrote: > > > I don't think it's possible. Both calls are done with interrupts disabled. > > > > If that's the case then there's another potential issue. Deadlock may happen > > when calling smp_call_function_single with interrupt disabled. > > We don't do that. vcpu migration calls vcpu_clear() with interrupts > enabled, which then calls smp_call_function_single(), which calls > __vcpu_clear() with interrupts disabled. vmclear_local_vcpus() is > called from interrupts disabled (and calls __vcpu_clear() directly). > OK, that's clear to me now. Thanks Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 05/24/2011 02:30 PM, Tian, Kevin wrote: > > > > We don't do that. vcpu migration calls vcpu_clear() with interrupts > > enabled, which then calls smp_call_function_single(), which calls > > __vcpu_clear() with interrupts disabled. vmclear_local_vcpus() is > > called from interrupts disabled (and calls __vcpu_clear() directly). > > > > OK, that's clear to me now. Are there still open issues about the patch? (Nadav, please post patches in the future in new threads so they're easier to find)
> From: Avi Kivity [mailto:avi@redhat.com] > Sent: Tuesday, May 24, 2011 7:37 PM > > On 05/24/2011 02:30 PM, Tian, Kevin wrote: > > > > > > We don't do that. vcpu migration calls vcpu_clear() with interrupts > > > enabled, which then calls smp_call_function_single(), which calls > > > __vcpu_clear() with interrupts disabled. vmclear_local_vcpus() is > > > called from interrupts disabled (and calls __vcpu_clear() directly). > > > > > > > OK, that's clear to me now. > > Are there still open issues about the patch? > > (Nadav, please post patches in the future in new threads so they're > easier to find) > I'm fine with this patch except that Nadav needs to clarify the comment in __loaded_vmcs_clear (regarding to 'cpu migration' and 'cpu offline' part which I replied in another mail) Thanks Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 24, 2011, Tian, Kevin wrote about "RE: [PATCH 08/31] nVMX: Fix local_vcpus_link handling": > I'm fine with this patch except that Nadav needs to clarify the comment > in __loaded_vmcs_clear (regarding to 'cpu migration' and 'cpu offline' part > which I replied in another mail) I added a single letter, "v", to my comment ;-) Please note that the same code existed previously, and didn't have any comment. If you find this short comment more confusing (or wrong) than helpful, then I can just remove it. Avi, I'll send a new version of patch 1 in a few minutes, in a new thread this time ;-) Please let me know when (or if) you are prepared to apply the rest of the patches, so I can send a new version, rebased to the current trunk and with all the fixes people asked for in the last few days.
difference, but better safe than sorry. I had to restructure some of the code a bit to be able to properly use this new function (in 3 places - __loaded_vmcs_clear, nested_get_current_vmcs02, vmx_create_cpu). > Please repost separately after the fix, I'd like to apply it before the > rest of the series. I am adding a new version of this patch at the end of this mail. > (regarding interrupts, I think we can do that work post-merge. But I'd > like to see Kevin's comments addressed) I replied to his comments. Done some of the things he asked, and asked for more info on why/where he believes the current code is incorrect where I didn't understand what problems he pointed to, and am now waiting for him to reply. ------- 8< ------ 8< ---------- 8< ---------- 8< ----------- 8< ----------- Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus. In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it because (at least in theory) the processor might not have written all of its content back to memory. Since a patch from June 26, 2008, this is done using a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU. The problem is that with nested VMX, we no longer have the concept of a vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for L2s), and each of those may be have been last loaded on a different cpu. So instead of linking the vcpus, we link the VMCSs, using a new structure loaded_vmcs. This structure contains the VMCS, and the information pertaining to its loading on a specific cpu (namely, the cpu number, and whether it was already launched on this cpu once). In nested we will also use the same structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the currently active VMCS. Signed-off-by: Nadav Har'El <nyh@il.ibm.com> --- arch/x86/kvm/vmx.c | 150 ++++++++++++++++++++++++------------------- 1 file changed, 86 insertions(+), 64 deletions(-) --- .before/arch/x86/kvm/vmx.c 2011-05-23 21:46:14.000000000 +0300 +++ .after/arch/x86/kvm/vmx.c 2011-05-23 21:46:14.000000000 +0300 @@ -116,6 +116,18 @@ struct vmcs { char data[0]; }; +/* + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs + * loaded on this CPU (so we can clear them if the CPU goes down). + */ +struct loaded_vmcs { + struct vmcs *vmcs; + int cpu; + int launched; + struct list_head loaded_vmcss_on_cpu_link; +}; + struct shared_msr_entry { unsigned index; u64 data; @@ -124,9 +136,7 @@ struct shared_msr_entry { struct vcpu_vmx { struct kvm_vcpu vcpu; - struct list_head local_vcpus_link; unsigned long host_rsp; - int launched; u8 fail; u8 cpl; bool nmi_known_unmasked; @@ -140,7 +150,14 @@ struct vcpu_vmx { u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif - struct vmcs *vmcs; + /* + * loaded_vmcs points to the VMCS currently used in this vcpu. For a + * non-nested (L1) guest, it always points to vmcs01. For a nested + * guest (L2), it points to a different VMCS. + */ + struct loaded_vmcs vmcs01; + struct loaded_vmcs *loaded_vmcs; + bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { unsigned nr; struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; @@ -200,7 +217,11 @@ static int vmx_set_tss_addr(struct kvm * static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +/* + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. + */ +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; @@ -501,6 +522,13 @@ static void vmcs_clear(struct vmcs *vmcs vmcs, phys_addr); } +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) +{ + vmcs_clear(loaded_vmcs->vmcs); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; +} + static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); @@ -514,25 +542,24 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } -static void __vcpu_clear(void *arg) +static void __loaded_vmcs_clear(void *arg) { - struct vcpu_vmx *vmx = arg; + struct loaded_vmcs *loaded_vmcs = arg; int cpu = raw_smp_processor_id(); - if (vmx->vcpu.cpu == cpu) - vmcs_clear(vmx->vmcs); - if (per_cpu(current_vmcs, cpu) == vmx->vmcs) + if (loaded_vmcs->cpu != cpu) + return; /* cpu migration can race with cpu offline */ + if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; - list_del(&vmx->local_vcpus_link); - vmx->vcpu.cpu = -1; - vmx->launched = 0; + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + loaded_vmcs_init(loaded_vmcs); } -static void vcpu_clear(struct vcpu_vmx *vmx) +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { - if (vmx->vcpu.cpu == -1) - return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); + if (loaded_vmcs->cpu != -1) + smp_call_function_single( + loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); } static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) @@ -971,22 +998,22 @@ static void vmx_vcpu_load(struct kvm_vcp if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (vcpu->cpu != cpu) - vcpu_clear(vmx); + else if (vmx->loaded_vmcs->cpu != cpu) + loaded_vmcs_clear(vmx->loaded_vmcs); - if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->vmcs; - vmcs_load(vmx->vmcs); + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); } - if (vcpu->cpu != cpu) { + if (vmx->loaded_vmcs->cpu != cpu) { struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, + &per_cpu(loaded_vmcss_on_cpu, cpu)); local_irq_enable(); /* @@ -998,6 +1025,7 @@ static void vmx_vcpu_load(struct kvm_vcp rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + vmx->loaded_vmcs->cpu = cpu; } } @@ -1005,7 +1033,8 @@ static void vmx_vcpu_put(struct kvm_vcpu { __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { - __vcpu_clear(to_vmx(vcpu)); + __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); + vcpu->cpu = -1; kvm_cpu_vmxoff(); } } @@ -1469,7 +1498,7 @@ static int hardware_enable(void *garbage if (read_cr4() & X86_CR4_VMXE) return -EBUSY; - INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -1493,14 +1522,14 @@ static int hardware_enable(void *garbage return 0; } -static void vmclear_local_vcpus(void) +static void vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); - struct vcpu_vmx *vmx, *n; + struct loaded_vmcs *v, *n; - list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), - local_vcpus_link) - __vcpu_clear(vmx); + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + __loaded_vmcs_clear(v); } @@ -1515,7 +1544,7 @@ static void kvm_cpu_vmxoff(void) static void hardware_disable(void *garbage) { if (vmm_exclusive) { - vmclear_local_vcpus(); + vmclear_local_loaded_vmcss(); kvm_cpu_vmxoff(); } write_cr4(read_cr4() & ~X86_CR4_VMXE); @@ -1696,6 +1725,18 @@ static void free_vmcs(struct vmcs *vmcs) free_pages((unsigned long)vmcs, vmcs_config.order); } +/* + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded + */ +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) +{ + if (!loaded_vmcs->vmcs) + return; + loaded_vmcs_clear(loaded_vmcs); + free_vmcs(loaded_vmcs->vmcs); + loaded_vmcs->vmcs = NULL; +} + static void free_kvm_area(void) { int cpu; @@ -4166,6 +4207,7 @@ static void __noclone vmx_vcpu_run(struc if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -4236,7 +4278,7 @@ static void __noclone vmx_vcpu_run(struc "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, launched)), + [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), @@ -4276,7 +4318,7 @@ static void __noclone vmx_vcpu_run(struc vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - vmx->launched = 1; + vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); @@ -4288,41 +4330,17 @@ static void __noclone vmx_vcpu_run(struc #undef R #undef Q -static void vmx_free_vmcs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->vmcs) { - vcpu_clear(vmx); - free_vmcs(vmx->vmcs); - vmx->vmcs = NULL; - } -} - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - vmx_free_vmcs(vcpu); + free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); } -static inline void vmcs_init(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); - - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - - vmcs_clear(vmcs); - - if (!vmm_exclusive) - kvm_cpu_vmxoff(); -} - static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -4344,11 +4362,15 @@ static struct kvm_vcpu *vmx_create_vcpu( goto uninit_vcpu; } - vmx->vmcs = alloc_vmcs(); - if (!vmx->vmcs) + vmx->loaded_vmcs = &vmx->vmcs01; + vmx->loaded_vmcs->vmcs = alloc_vmcs(); + if (!vmx->loaded_vmcs->vmcs) goto free_msrs; - - vmcs_init(vmx->vmcs); + if (!vmm_exclusive) + kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); + loaded_vmcs_init(vmx->loaded_vmcs); + if (!vmm_exclusive) + kvm_cpu_vmxoff(); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -4377,7 +4399,7 @@ static struct kvm_vcpu *vmx_create_vcpu( return &vmx->vcpu; free_vmcs: - free_vmcs(vmx->vmcs); + free_vmcs(vmx->loaded_vmcs->vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: