diff mbox series

[v4,53/66] i386/tdx: Wire TDX_REPORT_FATAL_ERROR with GuestPanic facility

Message ID 20240125032328.2522472-54-xiaoyao.li@intel.com (mailing list archive)
State New, archived
Headers show
Series QEMU Guest memfd + QEMU TDX support | expand

Commit Message

Xiaoyao Li Jan. 25, 2024, 3:23 a.m. UTC
Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility

Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
---
Changes in v4:
- refine the documentation; (Markus)

Changes in v3:
- Add documentation of new type and struct; (Daniel)
- refine the error message handling; (Daniel)
---
 qapi/run-state.json   | 28 ++++++++++++++++++++--
 system/runstate.c     | 54 +++++++++++++++++++++++++++++++++++++++++++
 target/i386/kvm/tdx.c | 24 ++++++++++++++++++-
 3 files changed, 103 insertions(+), 3 deletions(-)

Comments

Markus Armbruster Feb. 19, 2024, 12:53 p.m. UTC | #1
Xiaoyao Li <xiaoyao.li@intel.com> writes:

> Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility
>
> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
> Changes in v4:
> - refine the documentation; (Markus)
>
> Changes in v3:
> - Add docmentation of new type and struct; (Daniel)
> - refine the error message handling; (Daniel)
> ---
>  qapi/run-state.json   | 28 ++++++++++++++++++++--
>  system/runstate.c     | 54 +++++++++++++++++++++++++++++++++++++++++++
>  target/i386/kvm/tdx.c | 24 ++++++++++++++++++-
>  3 files changed, 103 insertions(+), 3 deletions(-)
>
> diff --git a/qapi/run-state.json b/qapi/run-state.json
> index 08bc99cb8561..5429116679e3 100644
> --- a/qapi/run-state.json
> +++ b/qapi/run-state.json
> @@ -485,10 +485,12 @@
>  #
>  # @s390: s390 guest panic information type (Since: 2.12)
>  #
> +# @tdx: tdx guest panic information type (Since: 8.2)
> +#
>  # Since: 2.9
>  ##
>  { 'enum': 'GuestPanicInformationType',
> -  'data': [ 'hyper-v', 's390' ] }
> +  'data': [ 'hyper-v', 's390', 'tdx' ] }
>  
>  ##
>  # @GuestPanicInformation:
> @@ -503,7 +505,8 @@
>   'base': {'type': 'GuestPanicInformationType'},
>   'discriminator': 'type',
>   'data': {'hyper-v': 'GuestPanicInformationHyperV',
> -          's390': 'GuestPanicInformationS390'}}
> +          's390': 'GuestPanicInformationS390',
> +          'tdx' : 'GuestPanicInformationTdx'}}
>  
>  ##
>  # @GuestPanicInformationHyperV:
> @@ -566,6 +569,27 @@
>            'psw-addr': 'uint64',
>            'reason': 'S390CrashReason'}}
>  
> +##
> +# @GuestPanicInformationTdx:
> +#
> +# TDX Guest panic information specific to TDX GCHI
> +# TDG.VP.VMCALL<ReportFatalError>.
> +#
> +# @error-code: TD-specific error code

Where could a user find information on these error codes?

> +#
> +# @gpa: guest-physical address of a page that contains additional
> +#     error data, in forms of zero-terminated string.

"in the form of a zero-terminated string"

> +#
> +# @message: Human-readable error message provided by the guest. Not
> +#     to be trusted.

How is this message related to the one pointed to by @gpa?

> +#
> +# Since: 9.0
> +##
> +{'struct': 'GuestPanicInformationTdx',
> + 'data': {'error-code': 'uint64',
> +          'gpa': 'uint64',
> +          'message': 'str'}}
> +
>  ##
>  # @MEMORY_FAILURE:
>  #

[...]
Xiaoyao Li Feb. 27, 2024, 9:51 a.m. UTC | #2
On 2/19/2024 8:53 PM, Markus Armbruster wrote:
> Xiaoyao Li <xiaoyao.li@intel.com> writes:
> 
>> Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility
>>
>> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>> ---
>> Changes in v4:
>> - refine the documentation; (Markus)
>>
>> Changes in v3:
>> - Add docmentation of new type and struct; (Daniel)
>> - refine the error message handling; (Daniel)
>> ---
>>   qapi/run-state.json   | 28 ++++++++++++++++++++--
>>   system/runstate.c     | 54 +++++++++++++++++++++++++++++++++++++++++++
>>   target/i386/kvm/tdx.c | 24 ++++++++++++++++++-
>>   3 files changed, 103 insertions(+), 3 deletions(-)
>>
>> diff --git a/qapi/run-state.json b/qapi/run-state.json
>> index 08bc99cb8561..5429116679e3 100644
>> --- a/qapi/run-state.json
>> +++ b/qapi/run-state.json
>> @@ -485,10 +485,12 @@
>>   #
>>   # @s390: s390 guest panic information type (Since: 2.12)
>>   #
>> +# @tdx: tdx guest panic information type (Since: 8.2)
>> +#
>>   # Since: 2.9
>>   ##
>>   { 'enum': 'GuestPanicInformationType',
>> -  'data': [ 'hyper-v', 's390' ] }
>> +  'data': [ 'hyper-v', 's390', 'tdx' ] }
>>   
>>   ##
>>   # @GuestPanicInformation:
>> @@ -503,7 +505,8 @@
>>    'base': {'type': 'GuestPanicInformationType'},
>>    'discriminator': 'type',
>>    'data': {'hyper-v': 'GuestPanicInformationHyperV',
>> -          's390': 'GuestPanicInformationS390'}}
>> +          's390': 'GuestPanicInformationS390',
>> +          'tdx' : 'GuestPanicInformationTdx'}}
>>   
>>   ##
>>   # @GuestPanicInformationHyperV:
>> @@ -566,6 +569,27 @@
>>             'psw-addr': 'uint64',
>>             'reason': 'S390CrashReason'}}
>>   
>> +##
>> +# @GuestPanicInformationTdx:
>> +#
>> +# TDX Guest panic information specific to TDX GCHI
>> +# TDG.VP.VMCALL<ReportFatalError>.
>> +#
>> +# @error-code: TD-specific error code
> 
> Where could a user find information on these error codes?

TDX GHCI (Guest-host-communication-Interface) spec. It defines all the
TDVMCALL leaves.

0: panic;
0x1 - 0xffffffff: reserved.

>> +#
>> +# @gpa: guest-physical address of a page that contains additional
>> +#     error data, in forms of zero-terminated string.
> 
> "in the form of a zero-terminated string"

fixed.

>> +#
>> +# @message: Human-readable error message provided by the guest. Not
>> +#     to be trusted.
> 
> How is this message related to the one pointed to by @gpa?

In general, @message contains a brief message of the error. While @gpa 
(when valid) contains a verbose message.

We need both because sometimes, when a TD guest hits a fatal error, its
memory may be corrupted, so information cannot be passed via @gpa. The
information in @message is passed through GPRs.

>> +#
>> +# Since: 9.0
>> +##
>> +{'struct': 'GuestPanicInformationTdx',
>> + 'data': {'error-code': 'uint64',
>> +          'gpa': 'uint64',
>> +          'message': 'str'}}
>> +
>>   ##
>>   # @MEMORY_FAILURE:
>>   #
> 
> [...]
>
Markus Armbruster Feb. 27, 2024, 11:51 a.m. UTC | #3
Xiaoyao Li <xiaoyao.li@intel.com> writes:

> On 2/19/2024 8:53 PM, Markus Armbruster wrote:
>> Xiaoyao Li <xiaoyao.li@intel.com> writes:
>> 
>>> Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility
>>>
>>> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>> ---
>>> Changes in v4:
>>> - refine the documentation; (Markus)
>>>
>>> Changes in v3:
>>> - Add docmentation of new type and struct; (Daniel)
>>> - refine the error message handling; (Daniel)
>>> ---
>>>   qapi/run-state.json   | 28 ++++++++++++++++++++--
>>>   system/runstate.c     | 54 +++++++++++++++++++++++++++++++++++++++++++
>>>   target/i386/kvm/tdx.c | 24 ++++++++++++++++++-
>>>   3 files changed, 103 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/qapi/run-state.json b/qapi/run-state.json
>>> index 08bc99cb8561..5429116679e3 100644
>>> --- a/qapi/run-state.json
>>> +++ b/qapi/run-state.json
>>> @@ -485,10 +485,12 @@
>>>  #
>>>  # @s390: s390 guest panic information type (Since: 2.12)
>>>  #
>>> +# @tdx: tdx guest panic information type (Since: 8.2)
>>> +#
>>>  # Since: 2.9
>>>  ##
>>>  { 'enum': 'GuestPanicInformationType',
>>> -  'data': [ 'hyper-v', 's390' ] }
>>> +  'data': [ 'hyper-v', 's390', 'tdx' ] }
>>>   
>>>  ##
>>>  # @GuestPanicInformation:
>>> @@ -503,7 +505,8 @@
>>>    'base': {'type': 'GuestPanicInformationType'},
>>>    'discriminator': 'type',
>>>    'data': {'hyper-v': 'GuestPanicInformationHyperV',
>>> -          's390': 'GuestPanicInformationS390'}}
>>> +          's390': 'GuestPanicInformationS390',
>>> +          'tdx' : 'GuestPanicInformationTdx'}}
>>>   
>>>  ##
>>>  # @GuestPanicInformationHyperV:
>>> @@ -566,6 +569,27 @@
>>>             'psw-addr': 'uint64',
>>>             'reason': 'S390CrashReason'}}
>>>   
>>> +##
>>> +# @GuestPanicInformationTdx:
>>> +#
>>> +# TDX Guest panic information specific to TDX GCHI
>>> +# TDG.VP.VMCALL<ReportFatalError>.
>>> +#
>>> +# @error-code: TD-specific error code
>> 
>> Where could a user find information on these error codes?
>
> TDX GHCI (Guset-host-communication-Interface)spec. It defines all the 
> TDVMCALL leaves.
>
> 0: panic;
> 0x1 - 0xffffffff: reserved.

Would it make sense to add a reference?

>>> +#
>>> +# @gpa: guest-physical address of a page that contains additional
>>> +#     error data, in forms of zero-terminated string.
>> 
>> "in the form of a zero-terminated string"
>
> fixed.
>
>>> +#
>>> +# @message: Human-readable error message provided by the guest. Not
>>> +#     to be trusted.
>> 
>> How is this message related to the one pointed to by @gpa?
>
> In general, @message contains a brief message of the error. While @gpa 
> (when valid) contains a verbose message.
>
> The reason why we need both is because sometime when TD guest hits a 
> fatal error, its memory may get corrupted so we cannot pass information 
> via @gpa. Information in @message is passed through GPRs.

Well, we do pass information via @gpa, always.  I guess the page's
contents can be corrupted.

Perhaps something like

    # @message: Human-readable error message provided by the guest.  Not
    #     to be trusted.
    #
    # @gpa: guest-physical address of a page that contains more verbose 
    #     error information, as zero-terminated string.  Note that guest
    #     memory corruption can corrupt the page's contents.

>>> +#
>>> +# Since: 9.0
>>> +##
>>> +{'struct': 'GuestPanicInformationTdx',
>>> + 'data': {'error-code': 'uint64',
>>> +          'gpa': 'uint64',
>>> +          'message': 'str'}}

Note that my proposed doc string has the members in a different order.
Recommend to use the same order here.

>>> +
>>>   ##
>>>   # @MEMORY_FAILURE:
>>>   #
>> 
>> [...]
>>
Xiaoyao Li Feb. 27, 2024, 12:09 p.m. UTC | #4
On 2/27/2024 7:51 PM, Markus Armbruster wrote:
> Xiaoyao Li <xiaoyao.li@intel.com> writes:
> 
>> On 2/19/2024 8:53 PM, Markus Armbruster wrote:
>>> Xiaoyao Li <xiaoyao.li@intel.com> writes:
>>>
>>>> Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility
>>>>
>>>> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
>>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>>> ---
>>>> Changes in v4:
>>>> - refine the documentation; (Markus)
>>>>
>>>> Changes in v3:
>>>> - Add docmentation of new type and struct; (Daniel)
>>>> - refine the error message handling; (Daniel)
>>>> ---
>>>>    qapi/run-state.json   | 28 ++++++++++++++++++++--
>>>>    system/runstate.c     | 54 +++++++++++++++++++++++++++++++++++++++++++
>>>>    target/i386/kvm/tdx.c | 24 ++++++++++++++++++-
>>>>    3 files changed, 103 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/qapi/run-state.json b/qapi/run-state.json
>>>> index 08bc99cb8561..5429116679e3 100644
>>>> --- a/qapi/run-state.json
>>>> +++ b/qapi/run-state.json
>>>> @@ -485,10 +485,12 @@
>>>>   #
>>>>   # @s390: s390 guest panic information type (Since: 2.12)
>>>>   #
>>>> +# @tdx: tdx guest panic information type (Since: 8.2)
>>>> +#
>>>>   # Since: 2.9
>>>>   ##
>>>>   { 'enum': 'GuestPanicInformationType',
>>>> -  'data': [ 'hyper-v', 's390' ] }
>>>> +  'data': [ 'hyper-v', 's390', 'tdx' ] }
>>>>    
>>>>   ##
>>>>   # @GuestPanicInformation:
>>>> @@ -503,7 +505,8 @@
>>>>     'base': {'type': 'GuestPanicInformationType'},
>>>>     'discriminator': 'type',
>>>>     'data': {'hyper-v': 'GuestPanicInformationHyperV',
>>>> -          's390': 'GuestPanicInformationS390'}}
>>>> +          's390': 'GuestPanicInformationS390',
>>>> +          'tdx' : 'GuestPanicInformationTdx'}}
>>>>    
>>>>   ##
>>>>   # @GuestPanicInformationHyperV:
>>>> @@ -566,6 +569,27 @@
>>>>              'psw-addr': 'uint64',
>>>>              'reason': 'S390CrashReason'}}
>>>>    
>>>> +##
>>>> +# @GuestPanicInformationTdx:
>>>> +#
>>>> +# TDX Guest panic information specific to TDX GCHI
>>>> +# TDG.VP.VMCALL<ReportFatalError>.
>>>> +#
>>>> +# @error-code: TD-specific error code
>>>
>>> Where could a user find information on these error codes?
>>
>> TDX GHCI (Guset-host-communication-Interface)spec. It defines all the
>> TDVMCALL leaves.
>>
>> 0: panic;
>> 0x1 - 0xffffffff: reserved.
> 
> Would it make sense to add a reference?

https://cdrdv2.intel.com/v1/dl/getContent/726792

>>>> +#
>>>> +# @gpa: guest-physical address of a page that contains additional
>>>> +#     error data, in forms of zero-terminated string.
>>>
>>> "in the form of a zero-terminated string"
>>
>> fixed.
>>
>>>> +#
>>>> +# @message: Human-readable error message provided by the guest. Not
>>>> +#     to be trusted.
>>>
>>> How is this message related to the one pointed to by @gpa?
>>
>> In general, @message contains a brief message of the error. While @gpa
>> (when valid) contains a verbose message.
>>
>> The reason why we need both is because sometime when TD guest hits a
>> fatal error, its memory may get corrupted so we cannot pass information
>> via @gpa. Information in @message is passed through GPRs.
> 
> Well, we do pass information via @gpa, always.  I guess it page's
> contents can be corrupted.

No, not always. Bit 63 of the error code is the "GPA valid" bit;
@gpa is valid only when bit 63 of the error code is 1.

And current Linux TD guest implementation doesn't use @gpa at all.
https://github.com/torvalds/linux/blob/45ec2f5f6ed3ec3a79ba1329ad585497cdcbe663/arch/x86/coco/tdx/tdx.c#L131 


> Perhaps something like
> 
>      # @message: Human-readable error message provided by the guest.  Not
>      #     to be trusted.
>      #
>      # @gpa: guest-physical address of a page that contains more verbose
>      #     error information, as zero-terminated string.  Note that guest
>      #     memory corruption can corrupt the page's contents.
> 
>>>> +#
>>>> +# Since: 9.0
>>>> +##
>>>> +{'struct': 'GuestPanicInformationTdx',
>>>> + 'data': {'error-code': 'uint64',
>>>> +          'gpa': 'uint64',
>>>> +          'message': 'str'}}
> 
> Note that my proposed doc string has the members in a different order.
> Recommend to use the same order here.
> 
>>>> +
>>>>    ##
>>>>    # @MEMORY_FAILURE:
>>>>    #
>>>
>>> [...]
>>>
>
Markus Armbruster Feb. 27, 2024, 1:09 p.m. UTC | #5
Xiaoyao Li <xiaoyao.li@intel.com> writes:

> On 2/27/2024 7:51 PM, Markus Armbruster wrote:
>> Xiaoyao Li <xiaoyao.li@intel.com> writes:
>> 
>>> On 2/19/2024 8:53 PM, Markus Armbruster wrote:
>>>> Xiaoyao Li <xiaoyao.li@intel.com> writes:
>>>>
>>>>> Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility
>>>>>
>>>>> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
>>>>> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>>>>> ---
>>>>> Changes in v4:
>>>>> - refine the documentation; (Markus)
>>>>>
>>>>> Changes in v3:
>>>>> - Add docmentation of new type and struct; (Daniel)
>>>>> - refine the error message handling; (Daniel)
>>>>> ---
>>>>>    qapi/run-state.json   | 28 ++++++++++++++++++++--
>>>>>    system/runstate.c     | 54 +++++++++++++++++++++++++++++++++++++++++++
>>>>>    target/i386/kvm/tdx.c | 24 ++++++++++++++++++-
>>>>>    3 files changed, 103 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/qapi/run-state.json b/qapi/run-state.json
>>>>> index 08bc99cb8561..5429116679e3 100644
>>>>> --- a/qapi/run-state.json
>>>>> +++ b/qapi/run-state.json

[...]

>>>>> @@ -566,6 +569,27 @@
>>>>>              'psw-addr': 'uint64',
>>>>>              'reason': 'S390CrashReason'}}
>>>>>    
>>>>> +##
>>>>> +# @GuestPanicInformationTdx:
>>>>> +#
>>>>> +# TDX Guest panic information specific to TDX GCHI
>>>>> +# TDG.VP.VMCALL<ReportFatalError>.
>>>>> +#
>>>>> +# @error-code: TD-specific error code
>>>>
>>>> Where could a user find information on these error codes?
>>>
>>> TDX GHCI (Guset-host-communication-Interface)spec. It defines all the
>>> TDVMCALL leaves.
>>>
>>> 0: panic;
>>> 0x1 - 0xffffffff: reserved.
>> 
>> Would it make sense to add a reference?
>
> https://cdrdv2.intel.com/v1/dl/getContent/726792

URLs have this annoying tendency to rot.

What about

# @error-code: Error code as defined in "Guest-Hypervisor Communication
#     Interface (GHCI) Specification for Intel TDX 1.5"

>>>>> +#
>>>>> +# @gpa: guest-physical address of a page that contains additional
>>>>> +#     error data, in forms of zero-terminated string.
>>>>
>>>> "in the form of a zero-terminated string"
>>>
>>> fixed.
>>>
>>>>> +#
>>>>> +# @message: Human-readable error message provided by the guest. Not
>>>>> +#     to be trusted.
>>>>
>>>> How is this message related to the one pointed to by @gpa?
>>>
>>> In general, @message contains a brief message of the error. While @gpa
>>> (when valid) contains a verbose message.
>>>
>>> The reason why we need both is because sometime when TD guest hits a
>>> fatal error, its memory may get corrupted so we cannot pass information
>>> via @gpa. Information in @message is passed through GPRs.
>> 
>> Well, we do pass information via @gpa, always.  I guess it page's
>> contents can be corrupted.
>
> No. It's not always. the bit 63 of the error code is "GPA valid" bit. 
> @gpa is valid only when bit 63 of error code is 1.
>
> And current Linux TD guest implementation doesn't use @gpa at all.
> https://github.com/torvalds/linux/blob/45ec2f5f6ed3ec3a79ba1329ad585497cdcbe663/arch/x86/coco/tdx/tdx.c#L131 

Aha!

Why would we want to include @gpa when the "GPA valid" bit is off?

If we do want it, then

# @gpa: guest-physical address of a page that contains more verbose
#     error information, as zero-terminated string.  Valid when the
#     "GPA valid" bit is set in @error-code.

If we don't, then make @gpa optional, present when valid, and document
it like this:

# @gpa: guest-physical address of a page that contains more verbose
#     error information, as zero-terminated string.  Present when the
#     "GPA valid" bit is set in @error-code.

>> Perhaps something like
>> 
>>      # @message: Human-readable error message provided by the guest.  Not
>>      #     to be trusted.
>>      #
>>      # @gpa: guest-physical address of a page that contains more verbose
>>      #     error information, as zero-terminated string.  Note that guest
>>      #     memory corruption can corrupt the page's contents.
>> 
>>>>> +#
>>>>> +# Since: 9.0
>>>>> +##
>>>>> +{'struct': 'GuestPanicInformationTdx',
>>>>> + 'data': {'error-code': 'uint64',
>>>>> +          'gpa': 'uint64',
>>>>> +          'message': 'str'}}
>> 
>> Note that my proposed doc string has the members in a different order.
>> Recommend to use the same order here.
>> 
>>>>> +
>>>>>    ##
>>>>>    # @MEMORY_FAILURE:
>>>>>    #
>>>>
>>>> [...]
>>>>
>>
Xiaoyao Li Feb. 27, 2024, 2:51 p.m. UTC | #6
On 2/27/2024 9:09 PM, Markus Armbruster wrote:
[...]
>>>>>> @@ -566,6 +569,27 @@
>>>>>>               'psw-addr': 'uint64',
>>>>>>               'reason': 'S390CrashReason'}}
>>>>>>     
>>>>>> +##
>>>>>> +# @GuestPanicInformationTdx:
>>>>>> +#
>>>>>> +# TDX Guest panic information specific to TDX GCHI
>>>>>> +# TDG.VP.VMCALL<ReportFatalError>.
>>>>>> +#
>>>>>> +# @error-code: TD-specific error code
>>>>>
>>>>> Where could a user find information on these error codes?
>>>>
>>>> TDX GHCI (Guset-host-communication-Interface)spec. It defines all the
>>>> TDVMCALL leaves.
>>>>
>>>> 0: panic;
>>>> 0x1 - 0xffffffff: reserved.
>>>
>>> Would it make sense to add a reference?
>>
>> https://cdrdv2.intel.com/v1/dl/getContent/726792
> 
> URLs have this annoying tendency to rot.
> 
> What about
> 
> # @error-code: Error code as defined in "Guest-Hypervisor Communication
> #     Interface (GHCI) Specification for Intel TDX 1.5"

I think it gets mentioned at the beginning of @GuestPanicInformationTdx

   TDX Guest panic information specific to TDX GHCI
   TDG.VP.VMCALL<ReportFatalError>.

Do we still need to mention it in every single member?

>>>>>> +#
>>>>>> +# @gpa: guest-physical address of a page that contains additional
>>>>>> +#     error data, in forms of zero-terminated string.
>>>>>
>>>>> "in the form of a zero-terminated string"
>>>>
>>>> fixed.
>>>>
>>>>>> +#
>>>>>> +# @message: Human-readable error message provided by the guest. Not
>>>>>> +#     to be trusted.
>>>>>
>>>>> How is this message related to the one pointed to by @gpa?
>>>>
>>>> In general, @message contains a brief message of the error. While @gpa
>>>> (when valid) contains a verbose message.
>>>>
>>>> The reason why we need both is because sometime when TD guest hits a
>>>> fatal error, its memory may get corrupted so we cannot pass information
>>>> via @gpa. Information in @message is passed through GPRs.
>>>
>>> Well, we do pass information via @gpa, always.  I guess it page's
>>> contents can be corrupted.
>>
>> No. It's not always. the bit 63 of the error code is "GPA valid" bit.
>> @gpa is valid only when bit 63 of error code is 1.
>>
>> And current Linux TD guest implementation doesn't use @gpa at all.
>> https://github.com/torvalds/linux/blob/45ec2f5f6ed3ec3a79ba1329ad585497cdcbe663/arch/x86/coco/tdx/tdx.c#L131
> 
> Aha!
> 
> Why would we want to include @gpa when the "GPA valid" bit is off?
> 
> If we do want it, then
> 
> # @gpa: guest-physical address of a page that contains more verbose
> #     error information, as zero-terminated string.  Valid when the
> #     "GPA valid" bit is set in @error-code.
> 
> If we don't, then make @gpa optional, present when valid, and document
> it like this:
> 
> # @gpa: guest-physical address of a page that contains more verbose
> #     error information, as zero-terminated string.  Present when the
> #     "GPA valid" bit is set in @error-code.

I will go this direction.

thanks!

>>> Perhaps something like
>>>
>>>       # @message: Human-readable error message provided by the guest.  Not
>>>       #     to be trusted.
>>>       #
>>>       # @gpa: guest-physical address of a page that contains more verbose
>>>       #     error information, as zero-terminated string.  Note that guest
>>>       #     memory corruption can corrupt the page's contents.
>>>
>>>>>> +#
>>>>>> +# Since: 9.0
>>>>>> +##
>>>>>> +{'struct': 'GuestPanicInformationTdx',
>>>>>> + 'data': {'error-code': 'uint64',
>>>>>> +          'gpa': 'uint64',
>>>>>> +          'message': 'str'}}
>>>
>>> Note that my proposed doc string has the members in a different order.
>>> Recommend to use the same order here.
>>>
>>>>>> +
>>>>>>     ##
>>>>>>     # @MEMORY_FAILURE:
>>>>>>     #
>>>>>
>>>>> [...]
>>>>>
>>>
> 
>
Markus Armbruster Feb. 27, 2024, 3:42 p.m. UTC | #7
Xiaoyao Li <xiaoyao.li@intel.com> writes:

> On 2/27/2024 9:09 PM, Markus Armbruster wrote:
> [...]
>>>>>>> @@ -566,6 +569,27 @@
>>>>>>>               'psw-addr': 'uint64',
>>>>>>>               'reason': 'S390CrashReason'}}
>>>>>>>     
>>>>>>> +##
>>>>>>> +# @GuestPanicInformationTdx:
>>>>>>> +#
>>>>>>> +# TDX Guest panic information specific to TDX GCHI
>>>>>>> +# TDG.VP.VMCALL<ReportFatalError>.
>>>>>>> +#
>>>>>>> +# @error-code: TD-specific error code
>>>>>>
>>>>>> Where could a user find information on these error codes?
>>>>>
>>>>> TDX GHCI (Guset-host-communication-Interface)spec. It defines all the
>>>>> TDVMCALL leaves.
>>>>>
>>>>> 0: panic;
>>>>> 0x1 - 0xffffffff: reserved.
>>>>
>>>> Would it make sense to add a reference?
>>>
>>> https://cdrdv2.intel.com/v1/dl/getContent/726792
>> 
>> URLs have this annoying tendency to rot.
>> 
>> What about
>> 
>> # @error-code: Error code as defined in "Guest-Hypervisor Communication
>> #     Interface (GHCI) Specification for Intel TDX 1.5"
>
> I think it gets mentioned at the beginning of @GuestPanicInformationTdx
>
>    TDX Guest panic information specific to TDX GHCI
>    TDG.VP.VMCALL<ReportFatalError>.
>
> Do we still to mention it in every single member?

No, I didn't recognize the alphabet soup there as a reference :)

Let me try again:

##
# @GuestPanicInformationTdx:
#
# TDX Guest panic information specific to TDX, as specified in the
# "Guest-Hypervisor Communication Interface (GHCI) Specification",
# section TDG.VP.VMCALL<ReportFatalError>.
#
# @error-code: TD-specific error code
#
[...]
#
# Since: 9.0
##

>>>>>>> +#
>>>>>>> +# @gpa: guest-physical address of a page that contains additional
>>>>>>> +#     error data, in forms of zero-terminated string.
>>>>>>
>>>>>> "in the form of a zero-terminated string"
>>>>>
>>>>> fixed.
>>>>>
>>>>>>> +#
>>>>>>> +# @message: Human-readable error message provided by the guest. Not
>>>>>>> +#     to be trusted.
>>>>>>
>>>>>> How is this message related to the one pointed to by @gpa?
>>>>>
>>>>> In general, @message contains a brief message of the error. While @gpa
>>>>> (when valid) contains a verbose message.
>>>>>
>>>>> The reason why we need both is because sometime when TD guest hits a
>>>>> fatal error, its memory may get corrupted so we cannot pass information
>>>>> via @gpa. Information in @message is passed through GPRs.
>>>>
>>>> Well, we do pass information via @gpa, always.  I guess it page's
>>>> contents can be corrupted.
>>>
>>> No. It's not always. the bit 63 of the error code is "GPA valid" bit.
>>> @gpa is valid only when bit 63 of error code is 1.
>>>
>>> And current Linux TD guest implementation doesn't use @gpa at all.
>>> https://github.com/torvalds/linux/blob/45ec2f5f6ed3ec3a79ba1329ad585497cdcbe663/arch/x86/coco/tdx/tdx.c#L131
>> 
>> Aha!
>> 
>> Why would we want to include @gpa when the "GPA valid" bit is off?
>> 
>> If we do want it, then
>> 
>> # @gpa: guest-physical address of a page that contains more verbose
>> #     error information, as zero-terminated string.  Valid when the
>> #     "GPA valid" bit is set in @error-code.
>> 
>> If we don't, then make @gpa optional, present when valid, and document
>> it like this:
>> 
>> # @gpa: guest-physical address of a page that contains more verbose
>> #     error information, as zero-terminated string.  Present when the
>> #     "GPA valid" bit is set in @error-code.
>
> I will go this direction.
>
> thanks!

You're welcome!

[...]
diff mbox series

Patch

diff --git a/qapi/run-state.json b/qapi/run-state.json
index 08bc99cb8561..5429116679e3 100644
--- a/qapi/run-state.json
+++ b/qapi/run-state.json
@@ -485,10 +485,12 @@ 
 #
 # @s390: s390 guest panic information type (Since: 2.12)
 #
+# @tdx: tdx guest panic information type (Since: 9.0)
+#
 # Since: 2.9
 ##
 { 'enum': 'GuestPanicInformationType',
-  'data': [ 'hyper-v', 's390' ] }
+  'data': [ 'hyper-v', 's390', 'tdx' ] }
 
 ##
 # @GuestPanicInformation:
@@ -503,7 +505,8 @@ 
  'base': {'type': 'GuestPanicInformationType'},
  'discriminator': 'type',
  'data': {'hyper-v': 'GuestPanicInformationHyperV',
-          's390': 'GuestPanicInformationS390'}}
+          's390': 'GuestPanicInformationS390',
+          'tdx' : 'GuestPanicInformationTdx'}}
 
 ##
 # @GuestPanicInformationHyperV:
@@ -566,6 +569,27 @@ 
           'psw-addr': 'uint64',
           'reason': 'S390CrashReason'}}
 
+##
+# @GuestPanicInformationTdx:
+#
+# TDX Guest panic information specific to TDX GHCI
+# TDG.VP.VMCALL<ReportFatalError>.
+#
+# @error-code: TD-specific error code
+#
+# @gpa: guest-physical address of a page that contains additional
+#     error data, in the form of a zero-terminated string.
+#
+# @message: Human-readable error message provided by the guest. Not
+#     to be trusted.
+#
+# Since: 9.0
+##
+{'struct': 'GuestPanicInformationTdx',
+ 'data': {'error-code': 'uint64',
+          'gpa': 'uint64',
+          'message': 'str'}}
+
 ##
 # @MEMORY_FAILURE:
 #
diff --git a/system/runstate.c b/system/runstate.c
index d6ab860ecaa7..1ae85ea2c345 100644
--- a/system/runstate.c
+++ b/system/runstate.c
@@ -519,6 +519,52 @@  static void qemu_system_wakeup(void)
     }
 }
 
+/*
+ * Return a loggable form of the guest-supplied panic message: @message
+ * itself when it is non-empty printable ASCII (or NULL/empty), otherwise a
+ * newly g_malloc()ed hex dump ("xx xx ...") of its bytes.
+ * NOTE(review): the visible caller never frees the hex dump - confirm.
+ */
+static char *tdx_parse_panic_message(char *message)
+{
+    bool printable = false;
+    char *buf = NULL;
+    size_t len = 0, i;
+
+    /*
+     * Although message is defined as a json string, we shouldn't
+     * unconditionally treat it as is because the guest generated it and
+     * it's not necessarily trustable.
+     */
+    if (message) {
+        /* The caller guarantees the NUL-terminated string. */
+        len = strlen(message);
+
+        printable = len > 0;
+        for (i = 0; i < len; i++) {
+            if (!(0x20 <= message[i] && message[i] <= 0x7e)) {
+                printable = false;
+                break;
+            }
+        }
+    }
+
+    if (!printable && len) {
+        /* 3 = length of "%02x ", plus 1 for the NUL snprintf appends */
+        buf = g_malloc(len * 3 + 1);
+        for (i = 0; i < len; i++) {
+            /* unsigned char: a negative char would print 8 hex digits */
+            snprintf(buf + 3 * i, 4, "%02x ", (unsigned char)message[i]);
+        }
+        /* replace the last ' '(space) to NUL */
+        buf[len * 3 - 1] = '\0';
+
+        return buf;
+    }
+
+    return message;
+}
+
 void qemu_system_guest_panicked(GuestPanicInformation *info)
 {
     qemu_log_mask(LOG_GUEST_ERROR, "Guest crashed");
@@ -560,7 +606,15 @@  void qemu_system_guest_panicked(GuestPanicInformation *info)
                           S390CrashReason_str(info->u.s390.reason),
                           info->u.s390.psw_mask,
                           info->u.s390.psw_addr);
+        } else if (info->type == GUEST_PANIC_INFORMATION_TYPE_TDX) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          " TDX guest reports fatal error:\"%s\""
+                          " error code: 0x%016" PRIx64 " gpa page: 0x%016" PRIx64 "\n",
+                          tdx_parse_panic_message(info->u.tdx.message),
+                          info->u.tdx.error_code,
+                          info->u.tdx.gpa);
         }
+
         qapi_free_GuestPanicInformation(info);
     }
 }
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 1c79032ca262..4fbb18135951 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -20,6 +20,7 @@ 
 #include "qom/object_interfaces.h"
 #include "standard-headers/asm-x86/kvm_para.h"
 #include "sysemu/kvm.h"
+#include "sysemu/runstate.h"
 #include "sysemu/sysemu.h"
 #include "exec/address-spaces.h"
 #include "exec/ramblock.h"
@@ -1078,11 +1079,26 @@  static int tdx_handle_get_quote(X86CPU *cpu, struct kvm_tdx_vmcall *vmcall)
     return 0;
 }
 
+static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code,
+                                        uint64_t gpa, char *message)
+{
+    GuestPanicInformation *info = g_new0(GuestPanicInformation, 1);
+
+    /* Fill in the TDX variant of the panic record; @cpu is not used here. */
+    info->type = GUEST_PANIC_INFORMATION_TYPE_TDX;
+    info->u.tdx.message = message;
+    info->u.tdx.gpa = gpa;
+    info->u.tdx.error_code = error_code;
+
+    qemu_system_guest_panicked(info);
+}
+
 static int tdx_handle_report_fatal_error(X86CPU *cpu,
                                          struct kvm_tdx_vmcall *vmcall)
 {
     uint64_t error_code = vmcall->in_r12;
     char *message = NULL;
+    uint64_t gpa = -1ull;
 
     if (error_code & 0xffff) {
         error_report("TDX: REPORT_FATAL_ERROR: invalid error code: "
@@ -1111,7 +1127,13 @@  static int tdx_handle_report_fatal_error(X86CPU *cpu,
         assert((char *)tmp == message + GUEST_PANIC_INFO_TDX_MESSAGE_MAX);
     }
 
-    error_report("TD guest reports fatal error. %s\n", message ? : "");
+#define TDX_REPORT_FATAL_ERROR_GPA_VALID    BIT_ULL(63)
+    if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) {
+        gpa = vmcall->in_r13;
+    }
+
+    tdx_panicked_on_fatal_error(cpu, error_code, gpa, message);
+
     return -1;
 }