From patchwork Tue Jan 5 14:12:46 2010
X-Patchwork-Submitter: Gleb Natapov
X-Patchwork-Id: 71019
From: Gleb Natapov
To: kvm@vger.kernel.org
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, avi@redhat.com,
    mingo@elte.hu, a.p.zijlstra@chello.nl, tglx@linutronix.de,
    hpa@zytor.com, riel@redhat.com, cl@linux-foundation.org
Subject: [PATCH v3 04/12] Add "handle page fault" PV helper.
Date: Tue, 5 Jan 2010 16:12:46 +0200
Message-Id: <1262700774-1808-5-git-send-email-gleb@redhat.com>
In-Reply-To: <1262700774-1808-1-git-send-email-gleb@redhat.com>
References: <1262700774-1808-1-git-send-email-gleb@redhat.com>

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 56ca41b..98edaa9 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -51,6 +51,9 @@ struct kvm_mmu_op_release_pt {
 	__u64 pt_phys;
 };
 
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
 struct kvm_vcpu_pv_apf_data {
 	__u32 reason;
 	__u32 enabled;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dd59a85..0c3e95b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -6,6 +6,7 @@
 #ifdef CONFIG_PARAVIRT
 #include <asm/pgtable_types.h>
 #include <asm/asm.h>
+#include <asm/ptrace.h>
 
 #include <asm/paravirt_types.h>
 
@@ -699,15 +700,21 @@ static inline void pmd_clear(pmd_t *pmdp)
 }
 #endif	/* CONFIG_X86_PAE */
 
+static inline void arch_end_context_switch(struct task_struct *next)
+{
+	PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
+}
+
 #define  __HAVE_ARCH_START_CONTEXT_SWITCH
 static inline void arch_start_context_switch(struct task_struct *prev)
 {
 	PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
 }
 
-static inline void arch_end_context_switch(struct task_struct *next)
+static inline int handle_page_fault(struct pt_regs *regs,
+				    unsigned long error_code)
 {
-	PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
+	return PVOP_CALL2(int, pv_cpu_ops.handle_pf, regs, error_code);
 }
 
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
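For readers who don't live in arch/x86: handle_page_fault() above is an ordinary
pv_ops hook, i.e. a call through a per-field function pointer that a guest may
override at boot. A stand-alone sketch of that dispatch shape follows; it is
illustrative only, and every name in it (my_cpu_ops, my_ret_0, and so on) is
invented rather than kernel API:

    #include <stdio.h>

    struct pt_regs;                 /* opaque here, as in the kernel */

    /* Illustrative op table: one pointer per overridable hook. */
    struct my_cpu_ops {
        int (*handle_pf)(struct pt_regs *regs, unsigned long error_code);
    };

    /* Bare-metal default: never claim the fault, let the normal path run. */
    static int my_ret_0(struct pt_regs *regs, unsigned long error_code)
    {
        return 0;
    }

    static struct my_cpu_ops my_cpu_ops = { .handle_pf = my_ret_0 };

    /* Shape of the handle_page_fault() wrapper, minus the PVOP patching. */
    static int my_handle_page_fault(struct pt_regs *regs,
                                    unsigned long error_code)
    {
        return my_cpu_ops.handle_pf(regs, error_code);
    }

    int main(void)
    {
        printf("%d\n", my_handle_page_fault(NULL, 0));  /* 0: not ours */
        return 0;
    }

A KVM guest flips the pointer at boot in exactly the way paravirt_ops_setup()
does further down in this patch.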
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b1e70d5..1aec9b5 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -186,6 +186,7 @@ struct pv_cpu_ops {
 
 	void (*start_context_switch)(struct task_struct *prev);
 	void (*end_context_switch)(struct task_struct *next);
+	int (*handle_pf)(struct pt_regs *regs, unsigned long error_code);
 };
 
 struct pv_irq_ops {
@@ -385,6 +386,7 @@ extern struct pv_lock_ops pv_lock_ops;
 unsigned paravirt_patch_nop(void);
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
 unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
+unsigned paravirt_patch_ret_0(void *insnbuf, unsigned len);
 unsigned paravirt_patch_ignore(unsigned len);
 unsigned paravirt_patch_call(void *insnbuf,
 			     const void *target, u16 tgt_clobbers,
@@ -676,8 +678,10 @@ void paravirt_leave_lazy_mmu(void);
 void _paravirt_nop(void);
 u32 _paravirt_ident_32(u32);
 u64 _paravirt_ident_64(u64);
+unsigned long _paravirt_ret_0(void);
 
 #define paravirt_nop	((void *)_paravirt_nop)
+#define paravirt_ret_0	((void *)_paravirt_ret_0)
 
 /* These all sit in the .parainstructions section to tell us what to patch. */
 struct paravirt_patch_site {
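The _paravirt_ret_0/paravirt_patch_ret_0 pair declared above exists so that, on
hardware without the feature, the patching machinery can overwrite the indirect
handle_pf call site with the short "xor %eax, %eax" defined in the
paravirt_patch_32/64 hunks further down, i.e. an inlined "return 0". A rough
user-space illustration of that payload (gcc inline asm, x86 only; ret_0_asm is
a made-up name):

    #include <stdio.h>

    /* What _paravirt_ret_0 compiles to, and what paravirt_patch_ret_0
     * stamps over each patched call site: "xor %eax, %eax", return 0. */
    static unsigned long ret_0_asm(void)
    {
        unsigned long ret;
        asm volatile("xor %%eax, %%eax" : "=a" (ret));
        return ret;
    }

    int main(void)
    {
        printf("%lu\n", ret_0_asm());   /* always prints 0 */
        return 0;
    }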
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 001222c..2245f35 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,6 +29,8 @@
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
 #include <linux/notifier.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
 #include <linux/reboot.h>
 #include <asm/timer.h>
@@ -54,6 +56,159 @@ static void kvm_io_delay(void)
 {
 }
 
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+	struct hlist_node link;
+	wait_queue_head_t wq;
+	u32 token;
+	int cpu;
+};
+
+static struct kvm_task_sleep_head {
+	spinlock_t lock;
+	struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+						  u32 token)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, &b->list) {
+		struct kvm_task_sleep_node *n =
+			hlist_entry(p, typeof(*n), link);
+		if (n->token == token)
+			return n;
+	}
+
+	return NULL;
+}
+
+static void apf_task_wait(struct task_struct *tsk, u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node n, *e;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&b->lock);
+	e = _find_apf_task(b, token);
+	if (e) {
+		/* dummy entry exists -> wake up was delivered ahead of PF */
+		hlist_del(&e->link);
+		kfree(e);
+		spin_unlock(&b->lock);
+		return;
+	}
+
+	n.token = token;
+	n.cpu = smp_processor_id();
+	init_waitqueue_head(&n.wq);
+	hlist_add_head(&n.link, &b->list);
+	spin_unlock(&b->lock);
+
+	for (;;) {
+		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (hlist_unhashed(&n.link))
+			break;
+		schedule();
+	}
+	finish_wait(&n.wq, &wait);
+
+	return;
+}
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+	hlist_del_init(&n->link);
+	if (waitqueue_active(&n->wq))
+		wake_up(&n->wq);
+}
+
+static void apf_task_wake(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+again:
+	spin_lock(&b->lock);
+	n = _find_apf_task(b, token);
+	if (!n) {
+		/*
+		 * async PF was not yet handled.
+		 * Add dummy entry for the token.
+		 */
+		n = kmalloc(sizeof(*n), GFP_ATOMIC);
+		if (!n) {
+			/*
+			 * Allocation failed! Busy wait while other vcpu
+			 * handles async PF.
+			 */
+			spin_unlock(&b->lock);
+			cpu_relax();
+			goto again;
+		}
+		n->token = token;
+		n->cpu = smp_processor_id();
+		init_waitqueue_head(&n->wq);
+		hlist_add_head(&n->link, &b->list);
+	} else
+		apf_task_wake_one(n);
+	spin_unlock(&b->lock);
+	return;
+}
+
+static void apf_task_wake_all(void)
+{
+	int i;
+
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+		struct hlist_node *p, *next;
+		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+		spin_lock(&b->lock);
+		hlist_for_each_safe(p, next, &b->list) {
+			struct kvm_task_sleep_node *n =
+				hlist_entry(p, typeof(*n), link);
+			if (n->cpu == smp_processor_id())
+				apf_task_wake_one(n);
+		}
+		spin_unlock(&b->lock);
+	}
+}
+
+int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
+{
+	u32 reason, token;
+
+	if (!__get_cpu_var(apf_reason).enabled)
+		return 0;
+
+	reason = __get_cpu_var(apf_reason).reason;
+	__get_cpu_var(apf_reason).reason = 0;
+
+	token = (u32)read_cr2();
+
+	switch (reason) {
+	default:
+		return 0;
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		/* page is swapped out by the host. */
+		apf_task_wait(current, token);
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		if (unlikely(token == ~0))
+			apf_task_wake_all();
+		else
+			apf_task_wake(token);
+		break;
+	}
+
+	return 1;
+}
+
 static void kvm_mmu_op(void *buffer, unsigned len)
 {
 	int r;
@@ -207,6 +362,9 @@ static void __init paravirt_ops_setup(void)
 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
 		pv_cpu_ops.io_delay = kvm_io_delay;
 
+	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+		pv_cpu_ops.handle_pf = kvm_handle_pf;
+
 	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
 		pv_mmu_ops.set_pte = kvm_set_pte;
 		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
@@ -270,11 +428,15 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 
 void __init kvm_guest_init(void)
 {
+	int i;
+
 	if (!kvm_para_available())
 		return;
 
 	paravirt_ops_setup();
 	register_reboot_notifier(&kvm_pv_reboot_nb);
+	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+		spin_lock_init(&async_pf_sleepers[i].lock);
 
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #else
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d..7d8f37b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -54,6 +54,11 @@ u64 _paravirt_ident_64(u64 x)
 	return x;
 }
 
+unsigned long _paravirt_ret_0(void)
+{
+	return 0;
+}
+
 void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -154,6 +159,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		ret = paravirt_patch_ident_32(insnbuf, len);
 	else if (opfunc == _paravirt_ident_64)
 		ret = paravirt_patch_ident_64(insnbuf, len);
+	else if (opfunc == _paravirt_ret_0)
+		ret = paravirt_patch_ret_0(insnbuf, len);
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
@@ -380,6 +387,7 @@ struct pv_cpu_ops pv_cpu_ops = {
 
 	.start_context_switch = paravirt_nop,
 	.end_context_switch = paravirt_nop,
+	.handle_pf = paravirt_ret_0,
 };
 
 struct pv_apic_ops pv_apic_ops = {
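The wait/wake pair in kvm.c has to survive the "wake arrives before wait" race:
the PAGE_READY notification can reach the vcpu before the faulting task ever
looks at the hash bucket, which is why apf_task_wake() plants a dummy node that
apf_task_wait() then consumes without sleeping, and why a failed GFP_ATOMIC
allocation degrades to busy-waiting. A compressed user-space model of just that
rendezvous, one bucket and no hashing, built on pthreads with invented names:

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    /* One bucket of the sleeper table: lock + (at most one) node. */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int node_present;    /* a node hangs in the bucket */
    static int node_is_dummy;   /* wake was delivered ahead of the waiter */
    static int woken;

    /* Models apf_task_wait(): runs in the task that got PAGE_NOT_PRESENT. */
    static void task_wait(void)
    {
        pthread_mutex_lock(&lock);
        if (node_present && node_is_dummy) {
            /* dummy entry exists -> wake beat us here, do not sleep */
            node_present = node_is_dummy = 0;
            pthread_mutex_unlock(&lock);
            return;
        }
        node_present = 1;       /* enqueue ourselves */
        while (!woken)
            pthread_cond_wait(&cond, &lock);
        woken = node_present = 0;
        pthread_mutex_unlock(&lock);
    }

    /* Models apf_task_wake(): runs when PAGE_READY arrives. */
    static void task_wake(void)
    {
        pthread_mutex_lock(&lock);
        if (!node_present) {
            /* waiter not there yet: leave a dummy node behind */
            node_present = node_is_dummy = 1;
        } else {
            woken = 1;
            pthread_cond_broadcast(&cond);
        }
        pthread_mutex_unlock(&lock);
    }

    static void *waiter(void *arg)
    {
        task_wait();
        puts("waiter resumed");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        struct timespec ts = { 0, 50 * 1000 * 1000 };

        /* Race 1: wake first, then wait; the waiter must not block. */
        task_wake();
        task_wait();
        puts("wake-before-wait handled");

        /* Race 2: wait first, then wake. */
        pthread_create(&t, NULL, waiter, NULL);
        nanosleep(&ts, NULL);   /* crude: let the waiter block first */
        task_wake();
        pthread_join(&t, NULL);
        return 0;
    }

The kernel version gets the same effect with a per-bucket spinlock, an hlist
node on the waiter's stack, and hlist_unhashed() as the "woken" condition.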
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6..de006b1 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,8 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
+DEF_NATIVE(, mov0, "xor %eax, %eax");
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
 	/* arg in %eax, return in %eax */
@@ -24,6 +26,12 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 	return 0;
 }
 
+unsigned paravirt_patch_ret_0(void *insnbuf, unsigned len)
+{
+	return paravirt_patch_insns(insnbuf, len,
+				    start__mov0, end__mov0);
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34..d685e7d 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -21,6 +21,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 
 DEF_NATIVE(, mov32, "mov %edi, %eax");
 DEF_NATIVE(, mov64, "mov %rdi, %rax");
+DEF_NATIVE(, mov0, "xor %rax, %rax");
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
@@ -34,6 +35,12 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 			start__mov64, end__mov64);
 }
 
+unsigned paravirt_patch_ret_0(void *insnbuf, unsigned len)
+{
+	return paravirt_patch_insns(insnbuf, len,
+				    start__mov0, end__mov0);
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f627779..7abc3ee 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -953,6 +953,9 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	int write;
 	int fault;
 
+	if (handle_page_fault(regs, error_code))
+		return;
+
 	tsk = current;
 	mm = tsk->mm;
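Taken together, the hunks mean every guest #PF now consults the per-vcpu reason
word first: 0 falls through to the normal memory-management path,
PAGE_NOT_PRESENT parks the task on the token read from cr2, and PAGE_READY
wakes it (token ~0 broadcasts to every sleeper on the vcpu). A toy model of
that handshake; the shared structure and all names are invented for
illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define PV_REASON_PAGE_NOT_PRESENT 1
    #define PV_REASON_PAGE_READY       2

    /* Models the per-vcpu kvm_vcpu_pv_apf_data word the host writes before
     * injecting the fault; cr2 carries a token instead of an address. */
    struct apf_shared {
        uint32_t reason;
        uint32_t enabled;
    };

    static struct apf_shared apf = { .enabled = 1 };
    static uint32_t fake_cr2;

    /* Returns 1 if the fault was a host notification, 0 for a real #PF. */
    static int toy_handle_pf(void)
    {
        uint32_t reason, token;

        if (!apf.enabled)
            return 0;

        reason = apf.reason;
        apf.reason = 0;         /* ack: the host may reuse the slot */
        token = fake_cr2;

        switch (reason) {
        case PV_REASON_PAGE_NOT_PRESENT:
            printf("park task on token %u\n", token);
            return 1;
        case PV_REASON_PAGE_READY:
            if (token == ~0u)
                printf("wake every sleeper on this vcpu\n");
            else
                printf("wake sleeper for token %u\n", token);
            return 1;
        default:
            return 0;           /* genuine page fault: run the MM path */
        }
    }

    int main(void)
    {
        /* Host-side script: swap a page out, then announce it is back. */
        apf.reason = PV_REASON_PAGE_NOT_PRESENT; fake_cr2 = 42;
        toy_handle_pf();
        apf.reason = PV_REASON_PAGE_READY; fake_cr2 = 42;
        toy_handle_pf();
        apf.reason = 0; fake_cr2 = 0xdeadbeef;  /* a real fault */
        printf("handled=%d\n", toy_handle_pf());
        return 0;
    }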