diff mbox series

[v3,2/3] KVM: x86: On emulation failure, convey the exit reason, etc. to userspace

Message ID 20210729133931.1129696-3-david.edmondson@oracle.com (mailing list archive)
State New, archived
Headers show
Series kvm: x86: Convey the exit reason, etc. to user-space on emulation failure | expand

Commit Message

David Edmondson July 29, 2021, 1:39 p.m. UTC
Should instruction emulation fail, include the VM exit reason, etc. in
the emulation_failure data passed to userspace, in order that the VMM
can report it as a debugging aid when describing the failure.

Suggested-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: David Edmondson <david.edmondson@oracle.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++++
 arch/x86/kvm/vmx/vmx.c          |  5 +---
 arch/x86/kvm/x86.c              | 53 ++++++++++++++++++++++++++-------
 include/uapi/linux/kvm.h        |  7 +++++
 4 files changed, 56 insertions(+), 14 deletions(-)

Comments

Sean Christopherson July 30, 2021, 10:14 p.m. UTC | #1
On Thu, Jul 29, 2021, David Edmondson wrote:
> Should instruction emulation fail, include the VM exit reason, etc. in
> the emulation_failure data passed to userspace, in order that the VMM
> can report it as a debugging aid when describing the failure.
> 
> Suggested-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: David Edmondson <david.edmondson@oracle.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  5 ++++
>  arch/x86/kvm/vmx/vmx.c          |  5 +---
>  arch/x86/kvm/x86.c              | 53 ++++++++++++++++++++++++++-------
>  include/uapi/linux/kvm.h        |  7 +++++
>  4 files changed, 56 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index dfb902930cdc..17da43c1aa67 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1630,6 +1630,11 @@ extern u64 kvm_mce_cap_supported;
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>  int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
>  					void *insn, int insn_len);
> +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
> +					bool instruction_bytes,
> +					void *data, unsigned int ndata);
> +void kvm_prepare_emulation_failure_exit_with_reason(struct kvm_vcpu *vcpu,
> +						    bool instruction_bytes);
>  
>  void kvm_enable_efer_bits(u64);
>  bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index fefdecb0ff3d..a8d303c7c099 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5367,10 +5367,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
>  
>  		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
>  		    vcpu->arch.exception.pending) {
> -			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> -			vcpu->run->internal.suberror =
> -						KVM_INTERNAL_ERROR_EMULATION;
> -			vcpu->run->internal.ndata = 0;
> +			kvm_prepare_emulation_failure_exit_with_reason(vcpu, false);
>  			return 0;
>  		}
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a4fd10604f72..a97bacd8922f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7456,7 +7456,9 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
>  }
>  EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
>  
> -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
> +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
> +					bool instruction_bytes,
> +					void *data, unsigned int ndata)

'data' should be a 'u64 *', and 'ndata' should be a 'u8' that is actual ndata as
opposed to the size.  The obvious alternative would be to rename ndata to size,
but IMO that's unnecessarily complex, it's much easier to force the caller to work
with u64s.  That also reduces the probability of KVM mangling data[] by dumping
unrelated fields into a single data[] entry, or by leaving stale chunks (if the
incoming data is not a multiple of 8 bytes).

>  {
>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>  	u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
> @@ -7464,10 +7466,10 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
>  
>  	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>  	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
> -	run->emulation_failure.ndata = 0;
> +	run->emulation_failure.ndata = 1; /* Always include the flags. */
>  	run->emulation_failure.flags = 0;
>  
> -	if (insn_size) {
> +	if (instruction_bytes && insn_size) {
>  		run->emulation_failure.ndata = 3;
>  		run->emulation_failure.flags |=
>  			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
> @@ -7477,7 +7479,42 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
>  		memcpy(run->emulation_failure.insn_bytes,
>  		       ctxt->fetch.data, insn_size);
>  	}
> +
> +	ndata = min((size_t)ndata, sizeof(run->internal.data) -
> +		    run->emulation_failure.ndata * sizeof(u64));
> +	if (ndata) {
> +		unsigned int offset =
> +			offsetof(struct kvm_run, emulation_failure.flags) +
> +			run->emulation_failure.ndata * sizeof(u64);
> +
> +		memcpy((void *)run + offset, data, ndata);
> +		run->emulation_failure.ndata += ndata / sizeof(u64);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);

NAK on exporting this particular helper, it consumes vcpu->arch.emulate_ctxt,
which ideally wouldn't even be visible to vendor code (stupid SVM erratum).  The
emulation context will rarely, if ever, be valid if this is called from vendor code.

The SGX call is effectively guarded by instruction_bytes=false, but that's a
messy approach as the handle_emulation_failure() patch is the _only_ case where
emulate_ctxt can be valid.

> +void kvm_prepare_emulation_failure_exit_with_reason(struct kvm_vcpu *vcpu,
> +						    bool instruction_bytes)
> +{
> +	struct {
> +		__u64 exit_reason;

As mentioned in the prior patch, exit_reason is probably best kept to a u32 at
this time.

> +		__u64 exit_info1;
> +		__u64 exit_info2;
> +		__u32 intr_info;
> +		__u32 error_code;
> +	} exit_reason;

Oooh, you're dumping all the fields in kvm_run.  That took me forever to realize
because the struct is named "exit_reason".  Unless there's a naming conflict,
'data' would be the simplest, and if that's already taken, maybe 'info'?

I'm also not sure an anonymous struct is going to be the easiest to maintain.
I do like that the fields all have names, but on the other hand the data should
be padded so that each field is in its own data[] entry when dumped to userspace.
IMO, the padding complexity isn't worth the naming niceness since this code
doesn't actually care about what each field contains.

> +
> +	static_call(kvm_x86_get_exit_info)(vcpu,
> +					   &exit_reason.exit_reason,
> +					   &exit_reason.exit_info1,
> +					   &exit_reason.exit_info2,
> +					   &exit_reason.intr_info,
> +					   &exit_reason.error_code);
> +
> +	kvm_prepare_emulation_failure_exit(vcpu, instruction_bytes,
> +					   &exit_reason, sizeof(exit_reason));

Retrieving the exit reason and info should probably be in the inner helper, the
only case where the info will be stale is the VMX !unrestricted_guest
handle_invalid_guest_state() path, but even then I think the last VM-Exit info
would be interesting/relevant.  That should allow for a cleaner API too.

This is what I came up with after a fair bit of massaging.  The get_exit_info()
call is beyond gross, but I still think I like it more than a struct?  A
build-time assertion that the struct size is a multiple of 8 would allay my
concerns over leaking stack state to userspace, so I'm not totally opposed to it.

	struct {
		u32 exit_reason;
		u32 pad1;
		u64 info1;
		u64 info2;
		u32 intr_info;
		u32 pad2;
		u32 error_code;
		u32 pad3;
	} info;
	u64 ninfo = sizeof(info) / sizeof(u64);

	BUILD_BUG_ON(sizeof(info) % sizeof(u64));

	/*
	 * Zero the whole struct used to retrieve the exit info to ensure the
	 * padding does not leak stack data to userspace.
	 */
	memset(&info, 0, sizeof(info));

	static_call(kvm_x86_get_exit_info)(vcpu, &info.exit_reason,
					   &info.info1, &info.info2,
					   &info.intr_info, &info.error_code);



void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
					  u8 ndata);
void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);

static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
					   u8 ndata, u8 *insn_bytes, u8 insn_size)
{
	struct kvm_run *run = vcpu->run;
	u8 ndata_start;
	u64 info[5];

	/*
	 * Zero the whole array used to retrieve the exit info, casting to u32
	 * for select entries will leave some chunks uninitialized.
	 */
	memset(&info, 0, sizeof(info));

	static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
					   &info[2], (u32 *)&info[3],
					   (u32 *)&info[4]);

	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;

	/*
	 * There's currently space for 13 entries, but 5 are used for the exit
	 * reason and info.  Restrict to 4 to reduce the maintenance burden
	 * when expanding kvm_run.emulation_failure in the future.
	 */
	if (WARN_ON_ONCE(ndata > 4))
		ndata = 4;

	if (insn_size) {
		ndata_start = 3;
		run->emulation_failure.flags =
			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
		run->emulation_failure.insn_size = insn_size;
		memset(run->emulation_failure.insn_bytes, 0x90,
		       sizeof(run->emulation_failure.insn_bytes));
		memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
	} else {
		/* Always include the flags as a 'data' entry. */
		ndata_start = 1;
		run->emulation_failure.flags = 0;
	}

	memcpy(&run->internal.data[ndata_start], info, ARRAY_SIZE(info));
	memcpy(&run->internal.data[ndata_start + ARRAY_SIZE(info)], data, ndata);
}

static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
{
	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;

	prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
				       ctxt->fetch.end - ctxt->fetch.data);
}

void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
					  u8 ndata)
{
	prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
}
EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);

void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
{
	__kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
}
EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
David Edmondson Aug. 2, 2021, 7:28 a.m. UTC | #2
On Friday, 2021-07-30 at 22:14:48 GMT, Sean Christopherson wrote:

> On Thu, Jul 29, 2021, David Edmondson wrote:
>> Should instruction emulation fail, include the VM exit reason, etc. in
>> the emulation_failure data passed to userspace, in order that the VMM
>> can report it as a debugging aid when describing the failure.
>> 
>> Suggested-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: David Edmondson <david.edmondson@oracle.com>
>> ---
>>  arch/x86/include/asm/kvm_host.h |  5 ++++
>>  arch/x86/kvm/vmx/vmx.c          |  5 +---
>>  arch/x86/kvm/x86.c              | 53 ++++++++++++++++++++++++++-------
>>  include/uapi/linux/kvm.h        |  7 +++++
>>  4 files changed, 56 insertions(+), 14 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index dfb902930cdc..17da43c1aa67 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1630,6 +1630,11 @@ extern u64 kvm_mce_cap_supported;
>>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>>  int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
>>  					void *insn, int insn_len);
>> +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
>> +					bool instruction_bytes,
>> +					void *data, unsigned int ndata);
>> +void kvm_prepare_emulation_failure_exit_with_reason(struct kvm_vcpu *vcpu,
>> +						    bool instruction_bytes);
>>  
>>  void kvm_enable_efer_bits(u64);
>>  bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index fefdecb0ff3d..a8d303c7c099 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -5367,10 +5367,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
>>  
>>  		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
>>  		    vcpu->arch.exception.pending) {
>> -			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> -			vcpu->run->internal.suberror =
>> -						KVM_INTERNAL_ERROR_EMULATION;
>> -			vcpu->run->internal.ndata = 0;
>> +			kvm_prepare_emulation_failure_exit_with_reason(vcpu, false);
>>  			return 0;
>>  		}
>>  
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index a4fd10604f72..a97bacd8922f 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -7456,7 +7456,9 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
>>  
>> -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
>> +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
>> +					bool instruction_bytes,
>> +					void *data, unsigned int ndata)
>
> 'data' should be a 'u64 *', and 'ndata' should be a 'u8' that is actual ndata as
> opposed to the size.

Agreed that this will be better.

> The obvious alternative would be to rename ndata to size, but IMO
> that's unnecessarily complex, it's much easier to force the caller to
> work with u64s.  That also reduces the probability of KVM mangling
> data[] by dumping unrelated fields into a single data[] entry, or by
> leaving stale chunks (if the incoming data is not a multiple of 8
> bytes).
>
>>  {
>>  	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>>  	u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
>> @@ -7464,10 +7466,10 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
>>  
>>  	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>>  	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
>> -	run->emulation_failure.ndata = 0;
>> +	run->emulation_failure.ndata = 1; /* Always include the flags. */
>>  	run->emulation_failure.flags = 0;
>>  
>> -	if (insn_size) {
>> +	if (instruction_bytes && insn_size) {
>>  		run->emulation_failure.ndata = 3;
>>  		run->emulation_failure.flags |=
>>  			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
>> @@ -7477,7 +7479,42 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
>>  		memcpy(run->emulation_failure.insn_bytes,
>>  		       ctxt->fetch.data, insn_size);
>>  	}
>> +
>> +	ndata = min((size_t)ndata, sizeof(run->internal.data) -
>> +		    run->emulation_failure.ndata * sizeof(u64));
>> +	if (ndata) {
>> +		unsigned int offset =
>> +			offsetof(struct kvm_run, emulation_failure.flags) +
>> +			run->emulation_failure.ndata * sizeof(u64);
>> +
>> +		memcpy((void *)run + offset, data, ndata);
>> +		run->emulation_failure.ndata += ndata / sizeof(u64);
>> +	}
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
>
> NAK on exporting this particular helper, it consumes vcpu->arch.emulate_ctxt,
> which ideally wouldn't even be visible to vendor code (stupid SVM erratum).  The
> emulation context will rarely, if ever, be valid if this is called from vendor code.
>
> The SGX call is effectively guarded by instruction_bytes=false, but that's a
> messy approach as the handle_emulation_failure() patch is the _only_ case where
> emulate_ctxt can be valid.

Your changes to address this seem sensible.

>> +void kvm_prepare_emulation_failure_exit_with_reason(struct kvm_vcpu *vcpu,
>> +						    bool instruction_bytes)
>> +{
>> +	struct {
>> +		__u64 exit_reason;
>
> As mentioned in the prior patch, exit_reason is probably best kept to a u32 at
> this time.

Ack.

>> +		__u64 exit_info1;
>> +		__u64 exit_info2;
>> +		__u32 intr_info;
>> +		__u32 error_code;
>> +	} exit_reason;
>
> Oooh, you're dumping all the fields in kvm_run.  That took me forever to realize
> because the struct is named "exit_reason".  Unless there's a naming conflict,
> 'data' would be the simplest, and if that's already taken, maybe 'info'?
>
> I'm also not sure an anonymous struct is going to be the easiest to maintain.
> I do like that the fields all have names, but on the other hand the data should
> be padded so that each field is in its own data[] entry when dumped to userspace.
> IMO, the padding complexity isn't worth the naming niceness since this code
> doesn't actually care about what each field contains.

Given that this is avowedly not an ABI and that we are expecting any
(human) consumer to be intimate with the implementation to make sense of
it, is there really any requirement or need for padding?

In your example below (most of which I'm fine with), the padding has the
effect of wasting space that could be used for another u64 of debug
data.

>> +
>> +	static_call(kvm_x86_get_exit_info)(vcpu,
>> +					   &exit_reason.exit_reason,
>> +					   &exit_reason.exit_info1,
>> +					   &exit_reason.exit_info2,
>> +					   &exit_reason.intr_info,
>> +					   &exit_reason.error_code);
>> +
>> +	kvm_prepare_emulation_failure_exit(vcpu, instruction_bytes,
>> +					   &exit_reason, sizeof(exit_reason));
>
> Retrieving the exit reason and info should probably be in the inner helper, the
> only case where the info will be stale is the VMX !unrestricted_guest
> handle_invalid_guest_state() path, but even then I think the last VM-Exit info
> would be interesting/relevant.  That should allow for a cleaner API too.

Makes sense, thanks.

> This is what I came up with after a fair bit of massaging.  The get_exit_info()
> call is beyond gross, but I still think I like it more than a struct?  A
> build-time assertion that the struct size is a multiple of 8 would allay my
> concerns over leaking stack state to userspace, so I'm not totally opposed to it.
>
> 	struct {
> 		u32 exit_reason;
> 		u32 pad1;
> 		u64 info1;
> 		u64 info2;
> 		u32 intr_info;
> 		u32 pad2;
> 		u32 error_code;
> 		u32 pad3;
> 	} info;
> 	u64 ninfo = sizeof(info) / sizeof(u64);
>
> 	BUILD_BUG_ON(sizeof(info) % sizeof(u64));
>
> 	/*
> 	 * Zero the whole struct used to retrieve the exit info to ensure the
> 	 * padding does not leak stack data to userspace.
> 	 */
> 	memset(&info, 0, sizeof(info));
>
> 	static_call(kvm_x86_get_exit_info)(vcpu, &info.exit_reason,
> 					   &info.info1, &info.info2,
> 					   &info.intr_info, &info.error_code);
>
>
>
> void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
> 					  u8 ndata);
> void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
>
> static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
> 					   u8 ndata, u8 *insn_bytes, u8 insn_size)
> {
> 	struct kvm_run *run = vcpu->run;
> 	u8 ndata_start;
> 	u64 info[5];
>
> 	/*
> 	 * Zero the whole array used to retrieve the exit info, casting to u32
> 	 * for select entries will leave some chunks uninitialized.
> 	 */
> 	memset(&info, 0, sizeof(info));
>
> 	static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
> 					   &info[2], (u32 *)&info[3],
> 					   (u32 *)&info[4]);
>
> 	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> 	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
>
> 	/*
> 	 * There's currently space for 13 entries, but 5 are used for the exit
> 	 * reason and info.  Restrict to 4 to reduce the maintenance burden
> 	 * when expanding kvm_run.emulation_failure in the future.
> 	 */
> 	if (WARN_ON_ONCE(ndata > 4))
> 		ndata = 4;
>
> 	if (insn_size) {
> 		ndata_start = 3;
> 		run->emulation_failure.flags =
> 			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
> 		run->emulation_failure.insn_size = insn_size;
> 		memset(run->emulation_failure.insn_bytes, 0x90,
> 		       sizeof(run->emulation_failure.insn_bytes));
> 		memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
> 	} else {
> 		/* Always include the flags as a 'data' entry. */
> 		ndata_start = 1;
> 		run->emulation_failure.flags = 0;
> 	}

When we add another flag (presuming that we do, because if not there was
not much point in the flags) this will have to be restructured again. Is
there an objection to the original style? (prime ndata=1, flags=0, OR in
flags and adjust ndata as we go.)

> 	memcpy(&run->internal.data[ndata_start], info, ARRAY_SIZE(info));
> 	memcpy(&run->internal.data[ndata_start + ARRAY_SIZE(info)], data, ndata);
> }
>
> static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
> {
> 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>
> 	prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
> 				       ctxt->fetch.end - ctxt->fetch.data);
> }
>
> void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
> 					  u8 ndata)
> {
> 	prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
> }
> EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
>
> void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
> {
> 	__kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
> }
> EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
Sean Christopherson Aug. 2, 2021, 4:58 p.m. UTC | #3
On Mon, Aug 02, 2021, David Edmondson wrote:
> On Friday, 2021-07-30 at 22:14:48 GMT, Sean Christopherson wrote:
> 
> > On Thu, Jul 29, 2021, David Edmondson wrote:
> >> +		__u64 exit_info1;
> >> +		__u64 exit_info2;
> >> +		__u32 intr_info;
> >> +		__u32 error_code;
> >> +	} exit_reason;
> >
> > Oooh, you're dumping all the fields in kvm_run.  That took me forever to realize
> > because the struct is named "exit_reason".  Unless there's a naming conflict,
> > 'data' would be the simplest, and if that's already taken, maybe 'info'?
> >
> > I'm also not sure an anonymous struct is going to be the easiest to maintain.
> > I do like that the fields all have names, but on the other hand the data should
> > be padded so that each field is in its own data[] entry when dumped to userspace.
> > IMO, the padding complexity isn't worth the naming niceness since this code
> > doesn't actually care about what each field contains.
> 
> Given that this is avowedly not an ABI and that we are expecting any
> (human) consumer to be intimate with the implementation to make sense of
> it, is there really any requirement or need for padding?

My thought with the padding was to force each field into its own data[] entry.
E.g. if userspace does something like

	for (i = 0; i < ndata; i++)
		printf("\tdata[%d] = 0x%llx\n", i, data[i]);

then padding will yield

	data[0] = flags
	data[1] = exit_reason
	data[2] = exit_info1
	data[3] = exit_info2
	data[4] = intr_info
	data[5] = error_code

versus

	data[0] = <flags>
	data[1] = (exit_info1 << 32) | exit_reason
	data[2] = (exit_info2 << 32) | (exit_info1 >> 32)
	data[3] = (intr_info << 32) | (exit_info2 >> 32)
	data[4] = error_code

Changing exit_reason to a u64 would clean up the worst of the mangling, but until
there's actually a 64-bit exit reason to dump, that's just a more subtle way to
pad the data.

> In your example below (most of which I'm fine with), the padding has the
> effect of wasting space that could be used for another u64 of debug
> data.

Yes, but because it's not ABI, we can change it in the future if we get to the
point where we want to dump more info and don't have space.  Until that time, I
think it makes sense to prioritize readability with an ignorant (of the format)
userspace over memory footprint.

> > 	/*
> > 	 * There's currently space for 13 entries, but 5 are used for the exit
> > 	 * reason and info.  Restrict to 4 to reduce the maintenance burden
> > 	 * when expanding kvm_run.emulation_failure in the future.
> > 	 */
> > 	if (WARN_ON_ONCE(ndata > 4))
> > 		ndata = 4;
> >
> > 	if (insn_size) {
> > 		ndata_start = 3;
> > 		run->emulation_failure.flags =
> > 			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
> > 		run->emulation_failure.insn_size = insn_size;
> > 		memset(run->emulation_failure.insn_bytes, 0x90,
> > 		       sizeof(run->emulation_failure.insn_bytes));
> > 		memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
> > 	} else {
> > 		/* Always include the flags as a 'data' entry. */
> > 		ndata_start = 1;
> > 		run->emulation_failure.flags = 0;
> > 	}
> 
> When we add another flag (presuming that we do, because if not there was
> not much point in the flags) this will have to be restructured again. Is
> there an objection to the original style? (prime ndata=1, flags=0, OR in
> flags and adjust ndata as we go.)

No objection, though if you OR in flags then you should truly _adjust_ ndata, not
set it, e.g.

        /* Always include the flags as a 'data' entry. */
        ndata_start = 1;
        run->emulation_failure.flags = 0;

        if (insn_size) {
                ndata_start += 2;  <----------------------- Adjust, not override
                run->emulation_failure.flags |=
                        KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
                run->emulation_failure.insn_size = insn_size;
                memset(run->emulation_failure.insn_bytes, 0x90,
                       sizeof(run->emulation_failure.insn_bytes));
                memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
        }

> > 	memcpy(&run->internal.data[ndata_start], info, ARRAY_SIZE(info));
> > 	memcpy(&run->internal.data[ndata_start + ARRAY_SIZE(info)], data, ndata);
> > }
David Edmondson Aug. 2, 2021, 5:23 p.m. UTC | #4
On Monday, 2021-08-02 at 16:58:03 GMT, Sean Christopherson wrote:

> On Mon, Aug 02, 2021, David Edmondson wrote:
>> On Friday, 2021-07-30 at 22:14:48 GMT, Sean Christopherson wrote:
>> 
>> > On Thu, Jul 29, 2021, David Edmondson wrote:
>> >> +		__u64 exit_info1;
>> >> +		__u64 exit_info2;
>> >> +		__u32 intr_info;
>> >> +		__u32 error_code;
>> >> +	} exit_reason;
>> >
>> > Oooh, you're dumping all the fields in kvm_run.  That took me forever to realize
>> > because the struct is named "exit_reason".  Unless there's a naming conflict,
>> > 'data' would be the simplest, and if that's already taken, maybe 'info'?
>> >
>> > I'm also not sure an anonymous struct is going to be the easiest to maintain.
>> > I do like that the fields all have names, but on the other hand the data should
>> > be padded so that each field is in its own data[] entry when dumped to userspace.
>> > IMO, the padding complexity isn't worth the naming niceness since this code
>> > doesn't actually care about what each field contains.
>> 
>> Given that this is avowedly not an ABI and that we are expecting any
>> (human) consumer to be intimate with the implementation to make sense of
>> it, is there really any requirement or need for padding?
>
> My thought with the padding was to force each field into its own data[] entry.
> E.g. if userspace does something like
>
> 	for (i = 0; i < ndata; i++)
> 		printf("\tdata[%d] = 0x%llx\n", i, data[i]);
>
> then padding will yield
>
> 	data[0] = flags
> 	data[1] = exit_reason
> 	data[2] = exit_info1
> 	data[3] = exit_info2
> 	data[4] = intr_info
> 	data[5] = error_code
>
> versus
>
> 	data[0] = <flags>
> 	data[1] = (exit_info1 << 32) | exit_reason
> 	data[2] = (exit_info2 << 32) | (exit_info1 >> 32)
> 	data[3] = (intr_info << 32) | (exit_info2 >> 32)
> 	data[4] = error_code
>
> Changing exit_reason to a u64 would clean up the worst of the mangling, but until
> there's actually a 64-bit exit reason to dump, that's just a more subtle way to
> pad the data.

Unnecessarily extending exit_reason to u64 would be bad, I agree.

>> In your example below (most of which I'm fine with), the padding has the
>> effect of wasting space that could be used for another u64 of debug
>> data.
>
> Yes, but because it's not ABI, we can change it in the future if we get to the
> point where we want to dump more info and don't have space.  Until that time, I
> think it makes sense to prioritize readability with an ignorant (of the format)
> userspace over memory footprint.

This seems reasonable.

>> > 	/*
>> > 	 * There's currently space for 13 entries, but 5 are used for the exit
>> > 	 * reason and info.  Restrict to 4 to reduce the maintenance burden
>> > 	 * when expanding kvm_run.emulation_failure in the future.
>> > 	 */
>> > 	if (WARN_ON_ONCE(ndata > 4))
>> > 		ndata = 4;
>> >
>> > 	if (insn_size) {
>> > 		ndata_start = 3;
>> > 		run->emulation_failure.flags =
>> > 			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
>> > 		run->emulation_failure.insn_size = insn_size;
>> > 		memset(run->emulation_failure.insn_bytes, 0x90,
>> > 		       sizeof(run->emulation_failure.insn_bytes));
>> > 		memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
>> > 	} else {
>> > 		/* Always include the flags as a 'data' entry. */
>> > 		ndata_start = 1;
>> > 		run->emulation_failure.flags = 0;
>> > 	}
>> 
>> When we add another flag (presuming that we do, because if not there was
>> not much point in the flags) this will have to be restructured again. Is
>> there an objection to the original style? (prime ndata=1, flags=0, OR in
>> flags and adjust ndata as we go.)
>
> No objection, though if you OR in flags then you should truly _adjust_ ndata, not
> set it, e.g.

My understanding of Aaron's intent is that this would not be the
case.

That is, if we add another flag with payload and set that flag, we would
still have space for the instruction stream in data[] even if
KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES is not set.

Given that, we must *set* ndata each time we add in a flag, with the
value being the extent of data[] used by the payload corresponding to
that flag, and the flags must be considered in ascending order (or we
remember a "max" along the way).

Dumping the arbitrary debug data after the defined fields would require
adjusting ndata, of course.

If this is not the case, and the flag indicated payloads are packed at
the head of data[], then the current structure definition is misleading
and we should perhaps revise it.

>         /* Always include the flags as a 'data' entry. */
>         ndata_start = 1;
>         run->emulation_failure.flags = 0;
>
>         if (insn_size) {
>                 ndata_start += 2;  <----------------------- Adjust, not override
>                 run->emulation_failure.flags |=
>                         KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
>                 run->emulation_failure.insn_size = insn_size;
>                 memset(run->emulation_failure.insn_bytes, 0x90,
>                        sizeof(run->emulation_failure.insn_bytes));
>                 memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
>         }
>
>> > 	memcpy(&run->internal.data[ndata_start], info, ARRAY_SIZE(info));
>> > 	memcpy(&run->internal.data[ndata_start + ARRAY_SIZE(info)], data, ndata);
>> > }
Sean Christopherson Aug. 7, 2021, 12:59 a.m. UTC | #5
On Mon, Aug 02, 2021, David Edmondson wrote:
> On Monday, 2021-08-02 at 16:58:03 GMT, Sean Christopherson wrote:
> 
> > On Mon, Aug 02, 2021, David Edmondson wrote:
> >> On Friday, 2021-07-30 at 22:14:48 GMT, Sean Christopherson wrote:
> >> When we add another flag (presuming that we do, because if not there was
> >> not much point in the flags) this will have to be restructured again. Is
> >> there an objection to the original style? (prime ndata=1, flags=0, OR in
> >> flags and adjust ndata as we go.)
> >
> > No objection, though if you OR in flags then you should truly _adjust_ ndata, not
> > set it, e.g.
> 
> My understanding of Aaron's intent is that this would not be the case.
> 
> That is, if we add another flag with payload and set that flag, we would
> still have space for the instruction stream in data[] even if
> KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES is not set.

Hmm, I don't think we have to make that decision yet.  Userspace must check the
flag before consuming run->emulation_failure, so we haven't fully committed one
way or the other.

I believe the original thought was indeed to skip unused fields, but I don't think
that's actually the behavior we want for completely unrelated fields, i.e. flag
combinations that will _never_ be valid together.  The only reason to skip fields
would be to keep the offset of a particular field constant, so I think the rule
can be that if fields that can coexist but are controlled by different flags, they
must be in the same anonymous struct, but in general a union is ok.

It seems rather unlikely that we'll gain many more flags, but it would be really
unfortunate if we commit to skipping fields and then run out of space because of
that decision.

> Given that, we must *set* ndata each time we add in a flag

Technically we wouldn't have to (being more than a bit pedantic), we could keep
the same flow and just do the ndata_start bump outside of the OR path, e.g.

        /* Always include the flags as a 'data' entry. */
        ndata_start = 1;
        run->emulation_failure.flags = 0;

        /* Skip unused fields instead of overloading them when they're not used. */
        ndata_start += 2;
        if (insn_size) {
                run->emulation_failure.flags |=
                        KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
                run->emulation_failure.insn_size = insn_size;
                memset(run->emulation_failure.insn_bytes, 0x90,
                       sizeof(run->emulation_failure.insn_bytes));
                memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
        }

so that it's easier to understand the magic numbers used to adjust ndata_start.

But a better solution would be to have no magic numbers at all and set ndata_start
via sizeof(run->emulation_failure).  E.g.

	/*
	 * There's currently space for 13 entries, but 5 are used for the exit
	 * reason and info.  Restrict to 4 to reduce the maintenance burden
	 * when expanding kvm_run.emulation_failure in the future.
	 */
	if (WARN_ON_ONCE(ndata > 4))
		ndata = 4;

	ndata_start = sizeof(run->emulation_failure);
	memcpy(&run->internal.data[], info, ARRAY_SIZE(info));
	memcpy(&run->internal.data[ndata_start + ARRAY_SIZE(info)], data, ndata);

	run->internal.ndata = ndata_start + ARRAY_SIZE(info) + ndata;

Though I'd prefer we not skip fields, at least not yet, e.g. to condition userspace
to do the right thing if we decide to not skip when adding a second flag (if that
even ever happens).

> with the value being the extent of data[] used by the payload corresponding
> to that flag, and the flags must be considered in ascending order (or we
> remember a "max" along the way).
> 
> Dumping the arbitray debug data after the defined fields would require
> adjusting ndata, of course.
> 
> If this is not the case, and the flag indicated payloads are packed at
> the head of data[], then the current structure definition is misleading
> and we should perhaps revise it.

Ah, because there's no wrapping union.  I wouldn't object to something like this
to hint to userpace that the struct layout may not be purely additive in the
future.

diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index d9e4aabcb31a..6c79c1ce3703 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -402,8 +402,12 @@ struct kvm_run {
                        __u32 suberror;
                        __u32 ndata;
                        __u64 flags;
-                       __u8  insn_size;
-                       __u8  insn_bytes[15];
+                       union {
+                               struct {
+                                       __u8  insn_size;
+                                       __u8  insn_bytes[15];
+                               };
+                       };
                } emulation_failure;
                /* KVM_EXIT_OSI */
                struct {
diff mbox series

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dfb902930cdc..17da43c1aa67 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1630,6 +1630,11 @@  extern u64 kvm_mce_cap_supported;
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
 					void *insn, int insn_len);
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
+					bool instruction_bytes,
+					void *data, unsigned int ndata);
+void kvm_prepare_emulation_failure_exit_with_reason(struct kvm_vcpu *vcpu,
+						    bool instruction_bytes);
 
 void kvm_enable_efer_bits(u64);
 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fefdecb0ff3d..a8d303c7c099 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5367,10 +5367,7 @@  static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
 		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
 		    vcpu->arch.exception.pending) {
-			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			vcpu->run->internal.suberror =
-						KVM_INTERNAL_ERROR_EMULATION;
-			vcpu->run->internal.ndata = 0;
+			kvm_prepare_emulation_failure_exit_with_reason(vcpu, false);
 			return 0;
 		}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a4fd10604f72..a97bacd8922f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7456,7 +7456,9 @@  void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
 
-static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
+					bool instruction_bytes,
+					void *data, unsigned int ndata)
 {
 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 	u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
@@ -7464,10 +7466,10 @@  static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
 
 	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
-	run->emulation_failure.ndata = 0;
+	run->emulation_failure.ndata = 1; /* Always include the flags. */
 	run->emulation_failure.flags = 0;
 
-	if (insn_size) {
+	if (instruction_bytes && insn_size) {
 		run->emulation_failure.ndata = 3;
 		run->emulation_failure.flags |=
 			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
@@ -7477,7 +7479,42 @@  static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
 		memcpy(run->emulation_failure.insn_bytes,
 		       ctxt->fetch.data, insn_size);
 	}
+
+	ndata = min((size_t)ndata, sizeof(run->internal.data) -
+		    run->emulation_failure.ndata * sizeof(u64));
+	if (ndata) {
+		unsigned int offset =
+			offsetof(struct kvm_run, emulation_failure.flags) +
+			run->emulation_failure.ndata * sizeof(u64);
+
+		memcpy((void *)run + offset, data, ndata);
+		run->emulation_failure.ndata += ndata / sizeof(u64);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_emulation_failure_exit_with_reason(struct kvm_vcpu *vcpu,
+						    bool instruction_bytes)
+{
+	struct {
+		__u64 exit_reason;
+		__u64 exit_info1;
+		__u64 exit_info2;
+		__u32 intr_info;
+		__u32 error_code;
+	} exit_reason;
+
+	static_call(kvm_x86_get_exit_info)(vcpu,
+					   &exit_reason.exit_reason,
+					   &exit_reason.exit_info1,
+					   &exit_reason.exit_info2,
+					   &exit_reason.intr_info,
+					   &exit_reason.error_code);
+
+	kvm_prepare_emulation_failure_exit(vcpu, instruction_bytes,
+					   &exit_reason, sizeof(exit_reason));
 }
+EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit_with_reason);
 
 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 {
@@ -7493,16 +7530,14 @@  static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
 
 	if (kvm->arch.exit_on_emulation_error ||
 	    (emulation_type & EMULTYPE_SKIP)) {
-		prepare_emulation_failure_exit(vcpu);
+		kvm_prepare_emulation_failure_exit_with_reason(vcpu, true);
 		return 0;
 	}
 
 	kvm_queue_exception(vcpu, UD_VECTOR);
 
 	if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-		vcpu->run->internal.ndata = 0;
+		kvm_prepare_emulation_failure_exit_with_reason(vcpu, false);
 		return 0;
 	}
 
@@ -12095,9 +12130,7 @@  int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
 	 * doesn't seem to be a real use-case behind such requests, just return
 	 * KVM_EXIT_INTERNAL_ERROR for now.
 	 */
-	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-	vcpu->run->internal.ndata = 0;
+	kvm_prepare_emulation_failure_exit_with_reason(vcpu, false);
 
 	return 0;
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d9e4aabcb31a..f1ef4117b824 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -397,6 +397,12 @@  struct kvm_run {
 		 * "ndata" is correct, that new fields are enumerated in "flags",
 		 * and that each flag enumerates fields that are 64-bit aligned
 		 * and sized (so that ndata+internal.data[] is valid/accurate).
+		 *
+		 * Space beyond the defined fields may be used to
+		 * store arbitrary debug information relating to the
+		 * emulation failure. It is accounted for in "ndata"
+		 * but otherwise unspecified and is not represented in
+		 * "flags".
 		 */
 		struct {
 			__u32 suberror;
@@ -404,6 +410,7 @@  struct kvm_run {
 			__u64 flags;
 			__u8  insn_size;
 			__u8  insn_bytes[15];
+			/* Arbitrary debug data may follow. */
 		} emulation_failure;
 		/* KVM_EXIT_OSI */
 		struct {