diff mbox

[RFC] kvm: Make KVM DF intercept configurable

Message ID 20160301192822.GD22677@pd.tnic (mailing list archive)
State New, archived
Headers show

Commit Message

Borislav Petkov March 1, 2016, 7:28 p.m. UTC
Hi Paolo,

so I've been hacking at early code recently and found this to be pretty
useful. Thoughts?

This is AMD-only now but I'll extend it to Intel too, if there's
interest.

---
From: Borislav Petkov <bp@suse.de>
Date: Tue, 1 Mar 2016 14:34:28 +0100
Subject: [RFC PATCH] kvm: Make KVM DF intercept configurable

Sometimes it is useful when testing kernels in guests to know why the
guest triple-faults, especially if one is poking at early kernel code
where there's no debugging output yet.

So add a module parameter which makes the guest exit on a #DF and log
error info into a #DF tracepoint. Which turns out to be very useful:

1. Early PF causing a DF:
-------------------------

Comments

Paolo Bonzini March 1, 2016, 9:11 p.m. UTC | #1
On 01/03/2016 20:28, Borislav Petkov wrote:
> Hi Paolo,
> 
> so I've been hacking at early code recently and found this to be pretty
> useful. Thoughts?

I just use QEMU's binary translation mode to debug this kind of code (-d
in_asm is useful and relatively compact (because it only shows loops
once), or alternatively ept=0.  But perhaps... why not. :)  It's not
like it adds overhead.

Paolo

> This is AMD-only now but I'll extend it to Intel too, if there's
> interest.
> 
> ---
> From: Borislav Petkov <bp@suse.de>
> Date: Tue, 1 Mar 2016 14:34:28 +0100
> Subject: [RFC PATCH] kvm: Make KVM DF intercept configurable
> 
> Sometimes it is useful when testing kernels in guests to know why the
> guest triple-faults, especially if one is poking at early kernel code
> where there's no debugging output yet.
> 
> So add a module parameter which makes the guest exit on a #DF and log
> error info into a #DF tracepoint. Which turns out to be very useful:
> 
> 1. Early PF causing a DF:
> -------------------------
> 
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 22fbf9df61bb..57d7d7a68c7d 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -64,6 +64,8 @@ startup_64:
>          * tables and then reload them.
>          */
> 
> +       movq stack_start, %rsp
> ---
> 
> We can't read that early from stack_start because we haven't enabled
> paging yet. KVM logs:
> 
>  qemu-system-x86-9523  [001] ....  1941.666236: kvm_df: Guest DF! rIP: 0x1000000, prev exception: 0xe, error_code: 0x0
> 
> and the exception causing the #DF is a #PF (vector 0xe).
> 
> 2. #GP
> -------------------------
> 
> diff --git a/init/main.c b/init/main.c
> index 7c27de4577ed..869eb6110765 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -967,6 +969,9 @@ static int __ref kernel_init(void *unused)
>                        ramdisk_execute_command, ret);
>         }
> 
> +       load_idt(&no_idt);
> +       __asm__ __volatile__("int3");
> ---
> 
> Loading a NULL IDT and causing a debug interrupt, causes a #GP.
> 
> KVM logs:
> 
>  qemu-system-x86-12853 [000] ....  2463.500732: kvm_df: Guest DF! rIP: 0xffffffff815ba8ba, prev exception: 0xd, error_code: 0x1a
> 
> and the previous exception was a #GP (vector 0xd).
> 
> Signed-off-by: Borislav Petkov <bp@suse.de>
> ---
>  arch/x86/kvm/svm.c   | 15 +++++++++++++++
>  arch/x86/kvm/trace.h | 20 ++++++++++++++++++++
>  arch/x86/kvm/x86.c   |  1 +
>  3 files changed, 36 insertions(+)
> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index c13a64b7d789..09f78bbd8b47 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -205,6 +205,9 @@ module_param(npt, int, S_IRUGO);
>  static int nested = true;
>  module_param(nested, int, S_IRUGO);
>  
> +static int intercept_df = 0;
> +module_param(intercept_df, int, S_IRUGO);
> +
>  static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
>  static void svm_flush_tlb(struct kvm_vcpu *vcpu);
>  static void svm_complete_interrupts(struct vcpu_svm *svm);
> @@ -1023,6 +1026,8 @@ static void init_vmcb(struct vcpu_svm *svm)
>  	set_exception_intercept(svm, MC_VECTOR);
>  	set_exception_intercept(svm, AC_VECTOR);
>  	set_exception_intercept(svm, DB_VECTOR);
> +	if (intercept_df)
> +		set_exception_intercept(svm, DF_VECTOR);
>  
>  	set_intercept(svm, INTERCEPT_INTR);
>  	set_intercept(svm, INTERCEPT_NMI);
> @@ -1728,6 +1733,15 @@ static int nm_interception(struct vcpu_svm *svm)
>  	return 1;
>  }
>  
> +static int df_interception(struct vcpu_svm *svm)
> +{
> +	trace_kvm_df(svm->vmcb->save.rip,
> +		     svm->vcpu.arch.exception.nr,
> +		     svm->vcpu.arch.exception.has_error_code ?
> +		     svm->vcpu.arch.exception.error_code : 0);
> +	return 1;
> +}
> +
>  static bool is_erratum_383(void)
>  {
>  	int err, i;
> @@ -3308,6 +3322,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
>  	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
>  	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
>  	[SVM_EXIT_EXCP_BASE + NM_VECTOR]	= nm_interception,
> +	[SVM_EXIT_EXCP_BASE + DF_VECTOR]	= df_interception,
>  	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
>  	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
>  	[SVM_EXIT_INTR]				= intr_interception,
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index ad9f6a23f139..6143c53f7bde 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -89,6 +89,26 @@ TRACE_EVENT(kvm_hv_hypercall,
>  		  __entry->outgpa)
>  );
>  
> +TRACE_EVENT(kvm_df,
> +	TP_PROTO(unsigned long rip, u8 nr, u32 error_code),
> +	TP_ARGS(rip, nr, error_code),
> +
> +	TP_STRUCT__entry(
> +		__field(	unsigned long, rip		)
> +		__field(		   u8, nr		)
> +		__field(		  u32, error_code	)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->rip		= rip;
> +		__entry->nr		= nr;
> +		__entry->error_code	= error_code;
> +	),
> +
> +	TP_printk("Guest DF! rIP: 0x%lx, prev exception: 0x%x, error_code: 0x%x",
> +		  __entry->rip, __entry->nr, __entry->error_code)
> +);
> +
>  /*
>   * Tracepoint for PIO.
>   */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f18a08281015..49f1bd3279f2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -8388,3 +8388,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
>  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_df);
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov March 1, 2016, 10:41 p.m. UTC | #2
On Tue, Mar 01, 2016 at 10:11:57PM +0100, Paolo Bonzini wrote:
> I just use QEMU's binary translation mode to debug this kind of code (-d
> in_asm is useful and relatively compact (because it only shows loops
> once),

Ha, I had forgotten about "in_asm"! Thanks for reminding me, that's a
really cool feature I'm going to use. With it I see:

----------------
IN: 
0x0000000001000000:  mov    0xffffffff81cba1f8,%rsp
0x0000000001000008:  callq  0x10001a9

----------------

Now, it is obvious that 0xffffffff81cba1f8 is not mapped yet and we're
running from physical addresses. The DF tracepoint shows, in addition,
the previous exception vector causing the DF and I think that's useful.
As an additional debugging aid. Oh, and that doesn't need ept=0 and runs
at full speed.

> or alternatively ept=0. But perhaps... why not. :) It's not like it
> adds overhead.

Yeah, it is off by default and doesn't hurt anyone. And the diff size
is ok, IMHO. Lemme code the Intel side too and see how the whole thing
turns out.

Thanks.
Jan Kiszka March 2, 2016, 6:45 a.m. UTC | #3
On 2016-03-01 23:41, Borislav Petkov wrote:
> On Tue, Mar 01, 2016 at 10:11:57PM +0100, Paolo Bonzini wrote:
>> I just use QEMU's binary translation mode to debug this kind of code (-d
>> in_asm is useful and relatively compact (because it only shows loops
>> once),
> 
> Ha, I had forgotten about "in_asm"! Thanks for reminding me, that's a
> really cool feature I'm going to use. With it I see:
> 
> ----------------
> IN: 
> 0x0000000001000000:  mov    0xffffffff81cba1f8,%rsp
> 0x0000000001000008:  callq  0x10001a9
> 
> ----------------
> 
> Now, it is obvious that 0xffffffff81cba1f8 is not mapped yet and we're
> running from physical addresses. The DF tracepoint shows, in addition,
> the previous exception vector causing the DF and I think that's useful.
> As an additional debugging aid. Oh, and that doesn't need ept=0 and runs
> at full speed.
> 
>> or alternatively ept=0. But perhaps... why not. :) It's not like it
>> adds overhead.
> 
> Yeah, it is off by default and doesn't hurt anyone. And the diff size
> is ok, IMHO. Lemme code the Intel side too and see how the whole thing
> turns out.

To make this a serious debug feature, we should consider trapping all
exceptions on request (and reinjecting the unhandled ones). Not sure
right now, though, if that comes with more complications than the simple
#DF case.

Jan
Borislav Petkov March 2, 2016, 9:07 a.m. UTC | #4
On Wed, Mar 02, 2016 at 07:45:53AM +0100, Jan Kiszka wrote:
> To make this a serious debug feature, we should consider trapping all
> exceptions on request (and reinjecting the unhandled ones). Not sure
> right now, though, if that comes with more complications than the simple
> #DF case.

I can certainly try... Something like "intercept=<vector>,<vector>,..."
or so. Let me see how ugly it can get...

Thanks.
Paolo Bonzini March 2, 2016, 9:17 a.m. UTC | #5
On 02/03/2016 10:07, Borislav Petkov wrote:
>> > To make this a serious debug feature, we should consider trapping all
>> > exceptions on request (and reinjecting the unhandled ones). Not sure
>> > right now, though, if that comes with more complications than the simple
>> > #DF case.
> I can certainly try... Something like "intercept=<vector>,<vector>,..."
> or so. Let me see how ugly it can get...

For kernel folks, a bit mask can do. :)

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov March 2, 2016, 9:22 a.m. UTC | #6
On Wed, Mar 02, 2016 at 10:17:56AM +0100, Paolo Bonzini wrote:
> For kernel folks, a bit mask can do. :)

Haha, ok. Luckily, we have only ~32 vectors (including the reserved
ones) so we should be fine.

:-)
diff mbox

Patch

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 22fbf9df61bb..57d7d7a68c7d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -64,6 +64,8 @@  startup_64:
         * tables and then reload them.
         */

+       movq stack_start, %rsp
---

We can't read that early from stack_start because we haven't enabled
paging yet. KVM logs:

 qemu-system-x86-9523  [001] ....  1941.666236: kvm_df: Guest DF! rIP: 0x1000000, prev exception: 0xe, error_code: 0x0

and the exception causing the #DF is a #PF (vector 0xe).

2. #GP
-------------------------

diff --git a/init/main.c b/init/main.c
index 7c27de4577ed..869eb6110765 100644
--- a/init/main.c
+++ b/init/main.c
@@ -967,6 +969,9 @@  static int __ref kernel_init(void *unused)
                       ramdisk_execute_command, ret);
        }

+       load_idt(&no_idt);
+       __asm__ __volatile__("int3");
---

Loading a NULL IDT and causing a debug interrupt, causes a #GP.

KVM logs:

 qemu-system-x86-12853 [000] ....  2463.500732: kvm_df: Guest DF! rIP: 0xffffffff815ba8ba, prev exception: 0xd, error_code: 0x1a

and the previous exception was a #GP (vector 0xd).

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c   | 15 +++++++++++++++
 arch/x86/kvm/trace.h | 20 ++++++++++++++++++++
 arch/x86/kvm/x86.c   |  1 +
 3 files changed, 36 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c13a64b7d789..09f78bbd8b47 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -205,6 +205,9 @@  module_param(npt, int, S_IRUGO);
 static int nested = true;
 module_param(nested, int, S_IRUGO);
 
+static int intercept_df = 0;
+module_param(intercept_df, int, S_IRUGO);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -1023,6 +1026,8 @@  static void init_vmcb(struct vcpu_svm *svm)
 	set_exception_intercept(svm, MC_VECTOR);
 	set_exception_intercept(svm, AC_VECTOR);
 	set_exception_intercept(svm, DB_VECTOR);
+	if (intercept_df)
+		set_exception_intercept(svm, DF_VECTOR);
 
 	set_intercept(svm, INTERCEPT_INTR);
 	set_intercept(svm, INTERCEPT_NMI);
@@ -1728,6 +1733,15 @@  static int nm_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int df_interception(struct vcpu_svm *svm)
+{
+	trace_kvm_df(svm->vmcb->save.rip,
+		     svm->vcpu.arch.exception.nr,
+		     svm->vcpu.arch.exception.has_error_code ?
+		     svm->vcpu.arch.exception.error_code : 0);
+	return 1;
+}
+
 static bool is_erratum_383(void)
 {
 	int err, i;
@@ -3308,6 +3322,7 @@  static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
 	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
 	[SVM_EXIT_EXCP_BASE + NM_VECTOR]	= nm_interception,
+	[SVM_EXIT_EXCP_BASE + DF_VECTOR]	= df_interception,
 	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
 	[SVM_EXIT_EXCP_BASE + AC_VECTOR]	= ac_interception,
 	[SVM_EXIT_INTR]				= intr_interception,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ad9f6a23f139..6143c53f7bde 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -89,6 +89,26 @@  TRACE_EVENT(kvm_hv_hypercall,
 		  __entry->outgpa)
 );
 
+TRACE_EVENT(kvm_df,
+	TP_PROTO(unsigned long rip, u8 nr, u32 error_code),
+	TP_ARGS(rip, nr, error_code),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long, rip		)
+		__field(		   u8, nr		)
+		__field(		  u32, error_code	)
+	),
+
+	TP_fast_assign(
+		__entry->rip		= rip;
+		__entry->nr		= nr;
+		__entry->error_code	= error_code;
+	),
+
+	TP_printk("Guest DF! rIP: 0x%lx, prev exception: 0x%x, error_code: 0x%x",
+		  __entry->rip, __entry->nr, __entry->error_code)
+);
+
 /*
  * Tracepoint for PIO.
  */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f18a08281015..49f1bd3279f2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8388,3 +8388,4 @@  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_df);