
[v1,1/5] KVM: SEV: Provide support for SNP_GUEST_REQUEST NAE event

Message ID 20240621134041.3170480-2-michael.roth@amd.com (mailing list archive)
State New, archived
Series SEV-SNP: Add KVM support for attestation and KVM_EXIT_COCO

Commit Message

Michael Roth June 21, 2024, 1:40 p.m. UTC
From: Brijesh Singh <brijesh.singh@amd.com>

Version 2 of the GHCB specification added support for the SNP Guest
Request Message NAE event. The event allows an SEV-SNP guest to make
requests to the SEV-SNP firmware through the hypervisor using the
SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification.

This is used by guests primarily to request attestation reports from
firmware. Other request types are available as well, but the specifics
of what guest requests are being made are opaque to the hypervisor,
which only serves as a proxy for the guest requests and firmware
responses.

Implement handling for these events.

Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
[mdr: ensure FW command failures are indicated to guest, drop extended
 request handling to be re-written as separate patch, massage commit]
Signed-off-by: Michael Roth <michael.roth@amd.com>
Message-ID: <20240501085210.2213060-19-michael.roth@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/sev.c         | 69 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/sev-guest.h |  9 +++++
 2 files changed, 78 insertions(+)

Comments

Liam Merwick June 21, 2024, 3:52 p.m. UTC | #1
On 21/06/2024 14:40, Michael Roth wrote:
> From: Brijesh Singh <brijesh.singh@amd.com>
> 
> Version 2 of GHCB specification added support for the SNP Guest Request
> Message NAE event. The event allows for an SEV-SNP guest to make
> requests to the SEV-SNP firmware through hypervisor using the
> SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification.
> 
> This is used by guests primarily to request attestation reports from
> firmware. There are other request types are available as well, but the
> specifics of what guest requests are being made are opaque to the
> hypervisor, which only serves as a proxy for the guest requests and
> firmware responses.
> 
> Implement handling for these events.
> 
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
> Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
> [mdr: ensure FW command failures are indicated to guest, drop extended
>   request handling to be re-written as separate patch, massage commit]
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> Message-ID: <20240501085210.2213060-19-michael.roth@amd.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>   arch/x86/kvm/svm/sev.c         | 69 ++++++++++++++++++++++++++++++++++
>   include/uapi/linux/sev-guest.h |  9 +++++
>   2 files changed, 78 insertions(+)
> 
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index df8818759698..7338b987cadd 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -19,6 +19,7 @@
>   #include <linux/misc_cgroup.h>
>   #include <linux/processor.h>
>   #include <linux/trace_events.h>
> +#include <uapi/linux/sev-guest.h>
>   
>   #include <asm/pkru.h>
>   #include <asm/trapnr.h>
> @@ -3321,6 +3322,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
>   		if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
>   			goto vmgexit_err;
>   		break;
> +	case SVM_VMGEXIT_GUEST_REQUEST:
> +		if (!sev_snp_guest(vcpu->kvm))
> +			goto vmgexit_err;
> +		break;
>   	default:
>   		reason = GHCB_ERR_INVALID_EVENT;
>   		goto vmgexit_err;
> @@ -3939,6 +3944,67 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
>   	return ret;
>   }
>   
> +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
> +{
> +	struct sev_data_snp_guest_request data = {0};
> +	struct kvm *kvm = svm->vcpu.kvm;
> +	kvm_pfn_t req_pfn, resp_pfn;
> +	sev_ret_code fw_err = 0;
> +	int ret;
> +
> +	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
> +		return -EINVAL;
> +
> +	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));
> +	if (is_error_noslot_pfn(req_pfn))
> +		return -EINVAL;
> +
> +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> +	if (is_error_noslot_pfn(resp_pfn)) {
> +		ret = EINVAL;
> +		goto release_req;
> +	}
> +
> +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> +		ret = -EINVAL;
> +		kvm_release_pfn_clean(resp_pfn);
> +		goto release_req;
> +	}
> +
> +	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
> +	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
> +	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
> +
> +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> +	if (ret)
> +		return ret;


Should this be goto release_req; ?
Does resp_pfn need to be dealt with as well?
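
E.g. something along these lines (untested, just to illustrate the cleanup
ordering; whether the page must be reclaimed before it can be released on
this error path is an assumption on my part):

	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
	if (ret) {
		/* undo the firmware-owned state before handing the page back */
		snp_page_reclaim(kvm, resp_pfn);
		kvm_release_pfn_clean(resp_pfn);
		goto release_req;
	}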

> +
> +	/*
> +	 * If reclaim fails then there's a good chance the guest will no longer
> +	 * be runnable so just let userspace terminate the guest.
> +	 */
> +	if (snp_page_reclaim(kvm, resp_pfn)) {
> +		return -EIO;


Should this be setting ret = -EIO ? Next line is unreachable.
Does resp_pfn need to be dealt with as well?
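
Presumably this was meant to be something like:

	if (snp_page_reclaim(kvm, resp_pfn)) {
		ret = -EIO;
		goto release_req;
	}

(though if reclaim fails, releasing resp_pfn afterwards is probably not safe
either way)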


> +		goto release_req;
> +	}
> +
> +	/*
> +	 * As per GHCB spec, firmware failures should be communicated back to
> +	 * the guest via SW_EXITINFO2 rather than be treated as immediately
> +	 * fatal.
> +	 */
> +	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
> +				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
> +					      fw_err));
> +
> +	ret = 1; /* resume guest */
> +	kvm_release_pfn_dirty(resp_pfn);
> +
> +release_req:
> +	kvm_release_pfn_clean(req_pfn);
> +	return ret;
> +}
> +
>   static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
>   {
>   	struct vmcb_control_area *control = &svm->vmcb->control;
> @@ -4213,6 +4279,9 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
>   
>   		ret = 1;
>   		break;
> +	case SVM_VMGEXIT_GUEST_REQUEST:
> +		ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
> +		break;
>   	case SVM_VMGEXIT_UNSUPPORTED_EVENT:
>   		vcpu_unimpl(vcpu,
>   			    "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
> diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h
> index 154a87a1eca9..7bd78e258569 100644
> --- a/include/uapi/linux/sev-guest.h
> +++ b/include/uapi/linux/sev-guest.h
> @@ -89,8 +89,17 @@ struct snp_ext_report_req {
>   #define SNP_GUEST_FW_ERR_MASK		GENMASK_ULL(31, 0)
>   #define SNP_GUEST_VMM_ERR_SHIFT		32
>   #define SNP_GUEST_VMM_ERR(x)		(((u64)x) << SNP_GUEST_VMM_ERR_SHIFT)
> +#define SNP_GUEST_FW_ERR(x)		((x) & SNP_GUEST_FW_ERR_MASK)
> +#define SNP_GUEST_ERR(vmm_err, fw_err)	(SNP_GUEST_VMM_ERR(vmm_err) | \
> +					 SNP_GUEST_FW_ERR(fw_err))
>   
> +/*
> + * The GHCB spec only formally defines INVALID_LEN/BUSY VMM errors, but define
> + * a GENERIC error code such that it won't ever conflict with GHCB-defined
> + * errors if any get added in the future.
> + */
>   #define SNP_GUEST_VMM_ERR_INVALID_LEN	1
>   #define SNP_GUEST_VMM_ERR_BUSY		2
> +#define SNP_GUEST_VMM_ERR_GENERIC	BIT(31)
>   
>   #endif /* __UAPI_LINUX_SEV_GUEST_H_ */

Regards,
Liam
Michael Roth June 21, 2024, 4:17 p.m. UTC | #2
On Fri, Jun 21, 2024 at 03:52:35PM +0000, Liam Merwick wrote:
> On 21/06/2024 14:40, Michael Roth wrote:
> > From: Brijesh Singh <brijesh.singh@amd.com>
> > 
> > Version 2 of GHCB specification added support for the SNP Guest Request
> > Message NAE event. The event allows for an SEV-SNP guest to make
> > requests to the SEV-SNP firmware through hypervisor using the
> > SNP_GUEST_REQUEST API defined in the SEV-SNP firmware specification.
> > 
> > This is used by guests primarily to request attestation reports from
> > firmware. There are other request types are available as well, but the
> > specifics of what guest requests are being made are opaque to the
> > hypervisor, which only serves as a proxy for the guest requests and
> > firmware responses.
> > 
> > Implement handling for these events.
> > 
> > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> > Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
> > Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
> > Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
> > Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> > Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
> > [mdr: ensure FW command failures are indicated to guest, drop extended
> >   request handling to be re-written as separate patch, massage commit]
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > Message-ID: <20240501085210.2213060-19-michael.roth@amd.com>
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > ---
> >   arch/x86/kvm/svm/sev.c         | 69 ++++++++++++++++++++++++++++++++++
> >   include/uapi/linux/sev-guest.h |  9 +++++
> >   2 files changed, 78 insertions(+)
> > 
> > diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> > index df8818759698..7338b987cadd 100644
> > --- a/arch/x86/kvm/svm/sev.c
> > +++ b/arch/x86/kvm/svm/sev.c
> > @@ -19,6 +19,7 @@
> >   #include <linux/misc_cgroup.h>
> >   #include <linux/processor.h>
> >   #include <linux/trace_events.h>
> > +#include <uapi/linux/sev-guest.h>
> >   
> >   #include <asm/pkru.h>
> >   #include <asm/trapnr.h>
> > @@ -3321,6 +3322,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
> >   		if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
> >   			goto vmgexit_err;
> >   		break;
> > +	case SVM_VMGEXIT_GUEST_REQUEST:
> > +		if (!sev_snp_guest(vcpu->kvm))
> > +			goto vmgexit_err;
> > +		break;
> >   	default:
> >   		reason = GHCB_ERR_INVALID_EVENT;
> >   		goto vmgexit_err;
> > @@ -3939,6 +3944,67 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
> >   	return ret;
> >   }
> >   
> > +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
> > +{
> > +	struct sev_data_snp_guest_request data = {0};
> > +	struct kvm *kvm = svm->vcpu.kvm;
> > +	kvm_pfn_t req_pfn, resp_pfn;
> > +	sev_ret_code fw_err = 0;
> > +	int ret;
> > +
> > +	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
> > +		return -EINVAL;
> > +
> > +	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));
> > +	if (is_error_noslot_pfn(req_pfn))
> > +		return -EINVAL;
> > +
> > +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> > +	if (is_error_noslot_pfn(resp_pfn)) {
> > +		ret = EINVAL;
> > +		goto release_req;
> > +	}
> > +
> > +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> > +		ret = -EINVAL;
> > +		kvm_release_pfn_clean(resp_pfn);
> > +		goto release_req;
> > +	}
> > +
> > +	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
> > +	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
> > +	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
> > +
> > +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> > +	if (ret)
> > +		return ret;
> 
> 
> Should this be goto release_req; ?
> Does resp_pfn need to be dealt with as well?

Argh... yes. I folded some helper functions into here and missed updating
some of the return sites. I'll respond to this patch with a revised version.
Sorry for the noise.

-Mike
Sean Christopherson June 26, 2024, 1:58 p.m. UTC | #3
On Fri, Jun 21, 2024, Michael Roth wrote:
> @@ -3939,6 +3944,67 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
>  	return ret;
>  }
>  
> +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
> +{
> +	struct sev_data_snp_guest_request data = {0};
> +	struct kvm *kvm = svm->vcpu.kvm;
> +	kvm_pfn_t req_pfn, resp_pfn;
> +	sev_ret_code fw_err = 0;
> +	int ret;
> +
> +	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
> +		return -EINVAL;
> +
> +	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));

This is going to sound odd, but I think we should use gfn_to_page(), i.e. require
that the page be a refcounted page so that it can be pinned.  Long story short,
one of my learnings from the whole non-refcounted pages saga is that doing DMA
to unpinned pages outside of mmu_lock is wildly unsafe, and for all intents and
purposes the ASP is a device doing DMA.  I'm also pretty sure KVM should actually
*pin* pages, as in FOLL_PIN, but I'm ok tackling that later.

For now, using gfn_to_page() would avoid creating ABI (DMA to VM_PFNMAP and/or
VM_MIXEDMAP memory) that KVM probably doesn't want to support in the long term.

[*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com
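
Roughly what I have in mind (untested, and the exact gfn_to_page() /
is_error_page() / kvm_release_page_*() idiom here is from memory, so treat
this as a sketch rather than the final API usage):

	struct page *req_page, *resp_page;

	req_page = gfn_to_page(kvm, gpa_to_gfn(req_gpa));
	if (is_error_page(req_page))
		return -EINVAL;

	resp_page = gfn_to_page(kvm, gpa_to_gfn(resp_gpa));
	if (is_error_page(resp_page)) {
		kvm_release_page_clean(req_page);
		return -EINVAL;
	}

	req_pfn = page_to_pfn(req_page);
	resp_pfn = page_to_pfn(resp_page);

	/* ... issue SEV_CMD_SNP_GUEST_REQUEST as before ... */

	kvm_release_page_dirty(resp_page);
	kvm_release_page_clean(req_page);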

> +	if (is_error_noslot_pfn(req_pfn))
> +		return -EINVAL;
> +
> +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> +	if (is_error_noslot_pfn(resp_pfn)) {
> +		ret = EINVAL;
> +		goto release_req;
> +	}
> +
> +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> +		ret = -EINVAL;
> +		kvm_release_pfn_clean(resp_pfn);
> +		goto release_req;
> +	}

I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
resp_pfn stays private for the duration of the operation.  And on the opposite
side, KVM can't guarantee that resp_pfn isn't being actively used by something
in the kernel, e.g. KVM might induce an unexpected #PF(RMP).

Why can't KVM require that the response/destination page already be private?  I'm
also somewhat confused by the reclaim below.  If KVM converts the response page
back to shared, doesn't that clobber the data?  If so, how does the guest actually
get the response?  I feel like I'm missing something.

Regardless of whether or not I'm missing something, this needs comments, and the
changelog needs to be rewritten with --verbose to explain what's going on.  

> +	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
> +	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
> +	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
> +
> +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> +	if (ret)
> +		return ret;

This leaks both req_pfn and resp_pfn.

> +
> +	/*
> +	 * If reclaim fails then there's a good chance the guest will no longer
> +	 * be runnable so just let userspace terminate the guest.
> +	 */
> +	if (snp_page_reclaim(kvm, resp_pfn)) {
> +		return -EIO;
> +		goto release_req;
> +	}
> +
> +	/*
> +	 * As per GHCB spec, firmware failures should be communicated back to
> +	 * the guest via SW_EXITINFO2 rather than be treated as immediately
> +	 * fatal.
> +	 */
> +	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
> +				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
> +					      fw_err));
> +
> +	ret = 1; /* resume guest */
> +	kvm_release_pfn_dirty(resp_pfn);
> +
> +release_req:
> +	kvm_release_pfn_clean(req_pfn);
> +	return ret;
> +}
Michael Roth June 26, 2024, 3:45 p.m. UTC | #4
On Wed, Jun 26, 2024 at 06:58:09AM -0700, Sean Christopherson wrote:
> On Fri, Jun 21, 2024, Michael Roth wrote:
> > @@ -3939,6 +3944,67 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
> >  	return ret;
> >  }
> >  
> > +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
> > +{
> > +	struct sev_data_snp_guest_request data = {0};
> > +	struct kvm *kvm = svm->vcpu.kvm;
> > +	kvm_pfn_t req_pfn, resp_pfn;
> > +	sev_ret_code fw_err = 0;
> > +	int ret;
> > +
> > +	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
> > +		return -EINVAL;
> > +
> > +	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));
> 
> This is going to sound odd, but I think we should use gfn_to_page(), i.e. require
> that the page be a refcounted page so that it can be pinned.  Long story short,
> one of my learnings from the whole non-refcounted pages saga is that doing DMA
> to unpinned pages outside of mmu_lock is wildly unsafe, and for all intents and
> purposes the ASP is a device doing DMA.  I'm also pretty sure KVM should actually
> *pin* pages, as in FOLL_PIN, but I'm ok tackling that later.
> 
> For now, using gfn_to_pages() would avoid creating ABI (DMA to VM_PFNMAP and/or
> VM_MIXEDMAP memory) that KVM probably doesn't want to support in the long term.

That seems reasonable. Will switch this to gfn_to_page().

> 
> [*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com
> 
> > +	if (is_error_noslot_pfn(req_pfn))
> > +		return -EINVAL;
> > +
> > +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> > +	if (is_error_noslot_pfn(resp_pfn)) {
> > +		ret = EINVAL;
> > +		goto release_req;
> > +	}
> > +
> > +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> > +		ret = -EINVAL;
> > +		kvm_release_pfn_clean(resp_pfn);
> > +		goto release_req;
> > +	}
> 
> I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
> resp_pfn stays private for the duration of the operation.  And on the opposite

When the page is set to private with asid=0,immutable=true arguments,
this puts the page in a special 'firmware-owned' state specifically meant
to avoid any changes to the page state happening from under the ASP's feet.
The only way to switch the page to any other state at this point is to
issue the SEV_CMD_SNP_PAGE_RECLAIM request to the ASP via
snp_page_reclaim().

I could see the guest shooting itself in the foot by issuing 2 guest
requests with the same req_pfn/resp_pfn, but on the KVM side whichever
request issues rmp_make_private() first would succeed, and then the
2nd request would generate an EINVAL to userspace.

In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
lock/unlock a page that's being handed to the ASP. But this should be
better documented either way.

> resp_pfn stays private for the duration of the operation.  And on the opposite
> side, KVM can't guarantee that resp_pfn isn't being actively used by something
> in the kernel, e.g. KVM might induce an unexpected #PF(RMP).
> 
> Why can't KVM require that the response/destination page already be private?  I'm

Hmm. This is a bit tricky. The GHCB spec states:

  The Guest Request NAE event requires two unique pages, one page for the
  request and one page for the response. Both pages must be assigned to the
  hypervisor (shared). The guest must supply the guest physical address of
  the pages (i.e., page aligned) as input.

  The hypervisor must translate the guest physical address (GPA) of each
  page into a system physical address (SPA). The SPA is used to verify that
  the request and response pages are assigned to the hypervisor.

At least for req_pfn, this makes sense because the header of the message
is actually plain text, and KVM does need to parse it to read the message
type from the header. It's just the req/resp payload that are encrypted
by the guest/firmware, i.e. it's not relying on hardware-based memory
encryption in this case.

Because of that though, I think we could potentially address this by
allocating the req/resp pages as separate pages outside of guest memory,
and simply copying the contents to/from the GPAs provided by the guest.
I'll look more into this approach.

If we go that route I think some of the concerns above go away as well,
but we might still need a lock or something to serialize access to
these separate/intermediate pages to avoid needing to allocate them every
time or per-request.
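
Very rough sketch of what I mean (the guest_req_buf/guest_resp_buf fields and
the guest_req_mutex are made up here for illustration; they'd be allocated
when the SNP context is created):

	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);

	mutex_lock(&sev->guest_req_mutex);

	/* copy the request out of guest memory into a kernel-owned page */
	if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) {
		ret = -EIO;
		goto out_unlock;
	}

	data.gctx_paddr = __psp_pa(sev->snp_context);
	data.req_paddr = __psp_pa(sev->guest_req_buf);
	data.res_paddr = __psp_pa(sev->guest_resp_buf);

	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);

	/* copy the response back to the GPA the guest supplied */
	if (!ret && kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE))
		ret = -EIO;

out_unlock:
	mutex_unlock(&sev->guest_req_mutex);

That way the pages handed to firmware are always kernel-owned and never
directly mapped by the guest or host userspace.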

> also somewhat confused by the reclaim below.  If KVM converts the response page
> back to shared, doesn't that clobber the data?  If so, how does the guest actually
> get the response?  I feel like I'm missing something.

In this case we're just setting it immutable/firmware-owned, which just
happens to be equivalent (in terms of the RMP table) to a guest-owned page,
but with rmp_entry.ASID=0/rmp_entry.immutable=true instead of
rmp_entry.ASID=<guest asid>/rmp_entry.immutable=false. So the contents remain
intact/readable after the reclaim.

> 
> Regardless of whether or not I'm missing something, this needs comments, and the
> changelog needs to be rewritten with --verbose to explain what's going on.  

Sure, will add more comments and details in the commit msg for v2.

> 
> > +	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
> > +	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
> > +	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
> > +
> > +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> > +	if (ret)
> > +		return ret;
> 
> This leaks both req_pfn and resp_pfn.

Yes, this one should be fixed up by the v1-revised version of the patch.

Thanks!

-Mike

> 
> > +
> > +	/*
> > +	 * If reclaim fails then there's a good chance the guest will no longer
> > +	 * be runnable so just let userspace terminate the guest.
> > +	 */
> > +	if (snp_page_reclaim(kvm, resp_pfn)) {
> > +		return -EIO;
> > +		goto release_req;
> > +	}
> > +
> > +	/*
> > +	 * As per GHCB spec, firmware failures should be communicated back to
> > +	 * the guest via SW_EXITINFO2 rather than be treated as immediately
> > +	 * fatal.
> > +	 */
> > +	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
> > +				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
> > +					      fw_err));
> > +
> > +	ret = 1; /* resume guest */
> > +	kvm_release_pfn_dirty(resp_pfn);
> > +
> > +release_req:
> > +	kvm_release_pfn_clean(req_pfn);
> > +	return ret;
> > +}
Sean Christopherson June 26, 2024, 5:13 p.m. UTC | #5
On Wed, Jun 26, 2024, Michael Roth wrote:
> On Wed, Jun 26, 2024 at 06:58:09AM -0700, Sean Christopherson wrote:
> > [*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com
> > 
> > > +	if (is_error_noslot_pfn(req_pfn))
> > > +		return -EINVAL;
> > > +
> > > +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> > > +	if (is_error_noslot_pfn(resp_pfn)) {
> > > +		ret = EINVAL;
> > > +		goto release_req;
> > > +	}
> > > +
> > > +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> > > +		ret = -EINVAL;
> > > +		kvm_release_pfn_clean(resp_pfn);
> > > +		goto release_req;
> > > +	}
> > 
> > I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
> > resp_pfn stays private for the duration of the operation.  And on the opposite
> 
> When the page is set to private with asid=0,immutable=true arguments,
> this puts the page in a special 'firmware-owned' state that specifically
> to avoid any changes to the page state happening from under the ASPs feet.
> The only way to switch the page to any other state at this point is to
> issue the SEV_CMD_SNP_PAGE_RECLAIM request to the ASP via
> snp_page_reclaim().
>
> I could see the guest shooting itself in the foot by issuing 2 guest
> requests with the same req_pfn/resp_pfn, but on the KVM side whichever
> request issues rmp_make_private() first would succeed, and then the
> 2nd request would generate an EINVAL to userspace.
> 
> In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
> lock/unlock a page that's being handed to the ASP. But this should be
> better documented either way.

What about the host kernel though?  I don't see anything here that ensures resp_pfn
isn't "regular" memory, i.e. that ensures the page isn't being concurrently accessed
by the host kernel (or some other userspace process).

Or is the "private" memory still accessible by the host?

> > resp_pfn stays private for the duration of the operation.  And on the opposite
> > side, KVM can't guarantee that resp_pfn isn't being actively used by something
> > in the kernel, e.g. KVM might induce an unexpected #PF(RMP).
> > 
> > Why can't KVM require that the response/destination page already be private?  I'm
> 
> Hmm. This is a bit tricky. The GHCB spec states:
> 
>   The Guest Request NAE event requires two unique pages, one page for the
>   request and one page for the response. Both pages must be assigned to the
>   hypervisor (shared). The guest must supply the guest physical address of
>   the pages (i.e., page aligned) as input.
> 
>   The hypervisor must translate the guest physical address (GPA) of each
>   page into a system physical address (SPA). The SPA is used to verify that
>   the request and response pages are assigned to the hypervisor.
> 
> At least for req_pfn, this makes sense because the header of the message
> is actually plain text, and KVM does need to parse it to read the message
> type from the header. It's just the req/resp payload that are encrypted
> by the guest/firmware, i.e. it's not relying on hardware-based memory
> encryption in this case.
> 
> Because of that though, I think we could potential address this by
> allocating the req/resp page as separate pages outside of guest memory,
> and simply copy the contents to/from the GPAs provided by the guest.
> I'll look more into this approach.
> 
> If we go that route I think some of the concerns above go away as well,
> but we might still need to a lock or something to serialize access to
> these separate/intermediate pages to avoid needed to allocate them every
> time or per-request.
> 
> > also somewhat confused by the reclaim below.  If KVM converts the response page
> > back to shared, doesn't that clobber the data?  If so, how does the guest actually
> > get the response?  I feel like I'm missing something.
> 
> In this case we're just setting it immutable/firmware-owned, which just
> happens to be equivalent (in terms of the RMP table) to a guest-owned page,
> but with rmp_entry.ASID=0/rmp_entry.immutable=true instead of
> rmp_entry.ASID=<guest asid>/rmp_entry.immutable=false. So the contents remain
> intact/readable after the reclaim.

Ah, I see the @asid=0 now.  The @asid=0,@immutable=true should be a separate API,
because IIUC, this always holds true:

	!asid == immutable

E.g.

static int rmp_assign_pfn(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
{
	struct rmp_state state;

	memset(&state, 0, sizeof(state));
	state.assigned = 1;
	state.asid = asid;
	state.immutable = immutable;
	state.gpa = gpa;
	state.pagesize = PG_LEVEL_TO_RMP(level);

	return rmpupdate(pfn, &state);	
}

/* Transition a page to guest-owned/private state in the RMP table. */
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid)
{
	if (WARN_ON_ONCE(!asid))
		return -EIO;

	return rmp_assign_pfn(pfn, gpa, level, asid, false);
}
EXPORT_SYMBOL_GPL(rmp_make_private);

/* Transition a page to AMD-SP owned state in the RMP table. */
int rmp_make_firmware(u64 pfn, u64 gpa)
{
	return rmp_assign_pfn(pfn, gpa, PG_LEVEL_4K, 0, true);
}
EXPORT_SYMBOL_GPL(rmp_make_firmware);
Michael Roth June 26, 2024, 5:42 p.m. UTC | #6
On Wed, Jun 26, 2024 at 10:13:44AM -0700, Sean Christopherson wrote:
> On Wed, Jun 26, 2024, Michael Roth wrote:
> > On Wed, Jun 26, 2024 at 06:58:09AM -0700, Sean Christopherson wrote:
> > > [*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com
> > > 
> > > > +	if (is_error_noslot_pfn(req_pfn))
> > > > +		return -EINVAL;
> > > > +
> > > > +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> > > > +	if (is_error_noslot_pfn(resp_pfn)) {
> > > > +		ret = EINVAL;
> > > > +		goto release_req;
> > > > +	}
> > > > +
> > > > +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> > > > +		ret = -EINVAL;
> > > > +		kvm_release_pfn_clean(resp_pfn);
> > > > +		goto release_req;
> > > > +	}
> > > 
> > > I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
> > > resp_pfn stays private for the duration of the operation.  And on the opposite
> > 
> > When the page is set to private with asid=0,immutable=true arguments,
> > this puts the page in a special 'firmware-owned' state that specifically
> > to avoid any changes to the page state happening from under the ASPs feet.
> > The only way to switch the page to any other state at this point is to
> > issue the SEV_CMD_SNP_PAGE_RECLAIM request to the ASP via
> > snp_page_reclaim().
> >
> > I could see the guest shooting itself in the foot by issuing 2 guest
> > requests with the same req_pfn/resp_pfn, but on the KVM side whichever
> > request issues rmp_make_private() first would succeed, and then the
> > 2nd request would generate an EINVAL to userspace.
> > 
> > In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
> > lock/unlock a page that's being handed to the ASP. But this should be
> > better documented either way.
> 
> What about the host kernel though?  I don't see anything here that ensures resp_pfn
> isn't "regular" memory, i.e. that ensure the page isn't being concurrently accessed
> by the host kernel (or some other userspace process).
> 
> Or is the "private" memory still accessible by the host?

It's accessible, but it is immutable according to the RMP table, so it would
require KVM to be elsewhere doing a write to the page, but that seems possible
if the guest is misbehaved. So I do think the RMP #PF concerns are warranted,
and that looking at using KVM-allocated intermediary/"bounce" pages to pass to
firmware is definitely worth looking into for v2 as that's just about the
safest way to guarantee nothing else will be writing to the page after
it gets set to immutable/firmware-owned.

> 
> > > resp_pfn stays private for the duration of the operation.  And on the opposite
> > > side, KVM can't guarantee that resp_pfn isn't being actively used by something
> > > in the kernel, e.g. KVM might induce an unexpected #PF(RMP).
> > > 
> > > Why can't KVM require that the response/destination page already be private?  I'm
> > 
> > Hmm. This is a bit tricky. The GHCB spec states:
> > 
> >   The Guest Request NAE event requires two unique pages, one page for the
> >   request and one page for the response. Both pages must be assigned to the
> >   hypervisor (shared). The guest must supply the guest physical address of
> >   the pages (i.e., page aligned) as input.
> > 
> >   The hypervisor must translate the guest physical address (GPA) of each
> >   page into a system physical address (SPA). The SPA is used to verify that
> >   the request and response pages are assigned to the hypervisor.
> > 
> > At least for req_pfn, this makes sense because the header of the message
> > is actually plain text, and KVM does need to parse it to read the message
> > type from the header. It's just the req/resp payload that are encrypted
> > by the guest/firmware, i.e. it's not relying on hardware-based memory
> > encryption in this case.
> > 
> > Because of that though, I think we could potential address this by
> > allocating the req/resp page as separate pages outside of guest memory,
> > and simply copy the contents to/from the GPAs provided by the guest.
> > I'll look more into this approach.
> > 
> > If we go that route I think some of the concerns above go away as well,
> > but we might still need to a lock or something to serialize access to
> > these separate/intermediate pages to avoid needed to allocate them every
> > time or per-request.
> > 
> > > also somewhat confused by the reclaim below.  If KVM converts the response page
> > > back to shared, doesn't that clobber the data?  If so, how does the guest actually
> > > get the response?  I feel like I'm missing something.
> > 
> > In this case we're just setting it immutable/firmware-owned, which just
> > happens to be equivalent (in terms of the RMP table) to a guest-owned page,
> > but with rmp_entry.ASID=0/rmp_entry.immutable=true instead of
> > rmp_entry.ASID=<guest asid>/rmp_entry.immutable=false. So the contents remain
> > intact/readable after the reclaim.
> 
> Ah, I see the @asid=0 now.  The @asid=0,@immutable=true should be a separate API,
> because IIUC, this always holds true:

That makes sense. I'll look at introducing rmp_make_firmware() as part of
the next revision.

-Mike

> 
> 	!asid == immutable
> 
> E.g.
> 
> static int rmp_assign_pfn(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
> {
> 	struct rmp_state state;
> 
> 	memset(&state, 0, sizeof(state));
> 	state.assigned = 1;
> 	state.asid = asid;
> 	state.immutable = immutable;
> 	state.gpa = gpa;
> 	state.pagesize = PG_LEVEL_TO_RMP(level);
> 
> 	return rmpupdate(pfn, &state);	
> }
> 
> /* Transition a page to guest-owned/private state in the RMP table. */
> int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid)
> {
> 	if (WARN_ON_ONCE(!asid))
> 		return -EIO;
> 
> 	return rmp_assign_pfn(pfn, gpa, level, asid, false);
> }
> EXPORT_SYMBOL_GPL(rmp_make_private);
> 
> /* Transition a page to AMD-SP owned state in the RMP table. */
> int rmp_make_firmware(u64 pfn, u64 gpa)
> {
> 	return rmp_assign_pfn(pfn, gpa, PG_LEVEL_4K, 0, true);
> }
> EXPORT_SYMBOL_GPL(rmp_make_firmware);
Sean Christopherson June 26, 2024, 7:54 p.m. UTC | #7
On Wed, Jun 26, 2024, Michael Roth wrote:
> On Wed, Jun 26, 2024 at 10:13:44AM -0700, Sean Christopherson wrote:
> > On Wed, Jun 26, 2024, Michael Roth wrote:
> > > On Wed, Jun 26, 2024 at 06:58:09AM -0700, Sean Christopherson wrote:
> > > > [*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com
> > > > 
> > > > > +	if (is_error_noslot_pfn(req_pfn))
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> > > > > +	if (is_error_noslot_pfn(resp_pfn)) {
> > > > > +		ret = EINVAL;
> > > > > +		goto release_req;
> > > > > +	}
> > > > > +
> > > > > +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> > > > > +		ret = -EINVAL;
> > > > > +		kvm_release_pfn_clean(resp_pfn);
> > > > > +		goto release_req;
> > > > > +	}
> > > > 
> > > > I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
> > > > resp_pfn stays private for the duration of the operation.  And on the opposite
> > > 
> > > When the page is set to private with asid=0,immutable=true arguments,
> > > this puts the page in a special 'firmware-owned' state that specifically
> > > to avoid any changes to the page state happening from under the ASPs feet.
> > > The only way to switch the page to any other state at this point is to
> > > issue the SEV_CMD_SNP_PAGE_RECLAIM request to the ASP via
> > > snp_page_reclaim().
> > >
> > > I could see the guest shooting itself in the foot by issuing 2 guest
> > > requests with the same req_pfn/resp_pfn, but on the KVM side whichever
> > > request issues rmp_make_private() first would succeed, and then the
> > > 2nd request would generate an EINVAL to userspace.
> > > 
> > > In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
> > > lock/unlock a page that's being handed to the ASP. But this should be
> > > better documented either way.
> > 
> > What about the host kernel though?  I don't see anything here that ensures resp_pfn
> > isn't "regular" memory, i.e. that ensure the page isn't being concurrently accessed
> > by the host kernel (or some other userspace process).
> > 
> > Or is the "private" memory still accessible by the host?
> 
> It's accessible, but it is immutable according to RMP table, so so it would
> require KVM to be elsewhere doing a write to the page,

I take it "immutable" means "read-only"?  If so, it would be super helpful to
document that in the APM.  I assumed "immutable" only meant that the RMP entry
itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.

> but that seems possible if the guest is misbehaved. So I do think the RMP #PF
> concerns are warranted, and that looking at using KVM-allocated
> intermediary/"bounce" pages to pass to firmware is definitely worth looking
> into for v2 as that's just about the safest way to guarantee nothing else
> will be writing to the page after it gets set to immutable/firmware-owned.
Tom Lendacky June 27, 2024, 2:48 p.m. UTC | #8
On 6/26/24 14:54, Sean Christopherson wrote:
> On Wed, Jun 26, 2024, Michael Roth wrote:
>> On Wed, Jun 26, 2024 at 10:13:44AM -0700, Sean Christopherson wrote:
>>> On Wed, Jun 26, 2024, Michael Roth wrote:
>>>> On Wed, Jun 26, 2024 at 06:58:09AM -0700, Sean Christopherson wrote:
>>>>> [*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com
>>>>>
>>>>>> +	if (is_error_noslot_pfn(req_pfn))
>>>>>> +		return -EINVAL;
>>>>>> +
>>>>>> +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
>>>>>> +	if (is_error_noslot_pfn(resp_pfn)) {
>>>>>> +		ret = EINVAL;
>>>>>> +		goto release_req;
>>>>>> +	}
>>>>>> +
>>>>>> +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
>>>>>> +		ret = -EINVAL;
>>>>>> +		kvm_release_pfn_clean(resp_pfn);
>>>>>> +		goto release_req;
>>>>>> +	}
>>>>>
>>>>> I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
>>>>> resp_pfn stays private for the duration of the operation.  And on the opposite
>>>>
>>>> When the page is set to private with asid=0,immutable=true arguments,
>>>> this puts the page in a special 'firmware-owned' state that specifically
>>>> to avoid any changes to the page state happening from under the ASPs feet.
>>>> The only way to switch the page to any other state at this point is to
>>>> issue the SEV_CMD_SNP_PAGE_RECLAIM request to the ASP via
>>>> snp_page_reclaim().
>>>>
>>>> I could see the guest shooting itself in the foot by issuing 2 guest
>>>> requests with the same req_pfn/resp_pfn, but on the KVM side whichever
>>>> request issues rmp_make_private() first would succeed, and then the
>>>> 2nd request would generate an EINVAL to userspace.
>>>>
>>>> In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
>>>> lock/unlock a page that's being handed to the ASP. But this should be
>>>> better documented either way.
>>>
>>> What about the host kernel though?  I don't see anything here that ensures resp_pfn
>>> isn't "regular" memory, i.e. that ensure the page isn't being concurrently accessed
>>> by the host kernel (or some other userspace process).
>>>
>>> Or is the "private" memory still accessible by the host?
>>
>> It's accessible, but it is immutable according to RMP table, so so it would
>> require KVM to be elsewhere doing a write to the page,
> 
> I take it "immutable" means "read-only"?  If so, it would be super helpful to
> document that in the APM.  I assumed "immutable" only meant that the RMP entry
> itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.

Not quite. It depends on the page state associated with the page. For
example, Hypervisor-Fixed pages have the immutable bit set, but can be
read and written.

The page states are documented in the SNP API (Chapter 5, Page
Management):

https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf

Thanks,
Tom

> 
>> but that seems possible if the guest is misbehaved. So I do think the RMP #PF
>> concerns are warranted, and that looking at using KVM-allocated
>> intermediary/"bounce" pages to pass to firmware is definitely worth looking
>> into for v2 as that's just about the safest way to guarantee nothing else
>> will be writing to the page after it gets set to immutable/firmware-owned.
Sean Christopherson June 27, 2024, 3:35 p.m. UTC | #9
On Thu, Jun 27, 2024, Tom Lendacky wrote:
> On 6/26/24 14:54, Sean Christopherson wrote:
> > On Wed, Jun 26, 2024, Michael Roth wrote:
> >>> What about the host kernel though?  I don't see anything here that ensures resp_pfn
> >>> isn't "regular" memory, i.e. that ensure the page isn't being concurrently accessed
> >>> by the host kernel (or some other userspace process).
> >>>
> >>> Or is the "private" memory still accessible by the host?
> >>
> >> It's accessible, but it is immutable according to RMP table, so so it would
> >> require KVM to be elsewhere doing a write to the page,
> > 
> > I take it "immutable" means "read-only"?  If so, it would be super helpful to
> > document that in the APM.  I assumed "immutable" only meant that the RMP entry
> > itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.
> 
> Not quite. It depends on the page state associated with the page. For
> example, Hypervisor-Fixed pages have the immutable bit set, but can be
> read and written.
> 
> The page states are documented in the SNP API (Chapter 5, Page
> Management):

Heh, but then that section says:

  Pages in the Firmware state are owned by the firmware. Because the RMP.Immutable
                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^
  bit is set, the hypervisor cannot write to Firmware pages nor alter the RMP entry
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  with the RMPUPDATE instruction.

which to me very clearly suggests that the RMP.Immutable bit is what makes the
page read-only.

Can you ask^Wbribe someone to add a "Table 11. Page State Properties" or something?
E.g. to explicitly list out the read vs. write protections and the state of the
page's data (encrypted, integrity-protected, zeroed?, etc).  I've read through
all of "5.2 Page States" and genuinely have no clue as to what protections most
of the states have.

Ah, never mind, I found "Table 15-39. RMP Memory Access Checks" in the APM.  FWIW,
that somewhat contradicts this blurb from the SNP ABI spec:

  The content of a Context page is encrypted and integrity protected so that the
  hypervisor cannot read or write to it.

I also find that statement confusing.  IIUC, the fact that the Context page is
encrypted and integrity protected doesn't actually have anything to do with the
host's ability to access the data.  The host _can_ read the data, but it will get
ciphertext.  But the host can't write the data because the page isn't HV-owned.

Actually, isn't the integrity protected part wrong?  I thought one of the benefits
of SNP vs. ES is that the RMP means the VMSA doesn't have to be integrity protected,
and so VMRUN and #VMEXIT transitions are faster because the CPU doesn't need to do
as much work.
Peter Gonda June 27, 2024, 4:23 p.m. UTC | #10
On Thu, Jun 27, 2024 at 9:35 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Jun 27, 2024, Tom Lendacky wrote:
> > On 6/26/24 14:54, Sean Christopherson wrote:
> > > On Wed, Jun 26, 2024, Michael Roth wrote:
> > >>> What about the host kernel though?  I don't see anything here that ensures resp_pfn
> > >>> isn't "regular" memory, i.e. that ensure the page isn't being concurrently accessed
> > >>> by the host kernel (or some other userspace process).
> > >>>
> > >>> Or is the "private" memory still accessible by the host?
> > >>
> > >> It's accessible, but it is immutable according to RMP table, so so it would
> > >> require KVM to be elsewhere doing a write to the page,
> > >
> > > I take it "immutable" means "read-only"?  If so, it would be super helpful to
> > > document that in the APM.  I assumed "immutable" only meant that the RMP entry
> > > itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.
> >
> > Not quite. It depends on the page state associated with the page. For
> > example, Hypervisor-Fixed pages have the immutable bit set, but can be
> > read and written.
> >
> > The page states are documented in the SNP API (Chapter 5, Page
> > Management):
>
> Heh, but then that section says:
>
>   Pages in the Firmware state are owned by the firmware. Because the RMP.Immutable
>                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^
>   bit is set, the hypervisor cannot write to Firmware pages nor alter the RMP entry
>   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   with the RMPUPDATE instruction.
>
> which to me very clearly suggests that the RMP.Immutable bit is what makes the
> page read-only.
>
> Can you ask^Wbribe someone to add a "Table 11. Page State Properties" or something?
> E.g. to explicitly list out the read vs. write protections and the state of the
> page's data (encrypted, integrity-protected, zeroed?, etc).  I've read through
> all of "5.2 Page States" and genuinely have no clue as to what protections most
> of the states have.
>
> Ah, never mind, I found "Table 15-39. RMP Memory Access Checks" in the APM.  FWIW,
> that somewhat contradicts this blurb from the SNP ABI spec:
>
>   The content of a Context page is encrypted and integrity protected so that the
>   hypervisor cannot read or write to it.
>
> I also find that statement confusing.  IIUC, the fact that the Context page is
> encrypted and integrity protected doesn't actually have anything to do with the
> host's ability to access the data.  The host _can_ read the data, but it will get
> ciphertext.  But the host can't write the data because the page isn't HV-owned.
>
> Actually, isn't the intregrity protected part wrong?  I thought one of the benefits
> of SNP vs. ES is that the RMP means the VMSA doesn't have to be integrity protected,
> and so VMRUN and #VMEXIT transitions are faster because the CPU doesn't need to do
> as much work.

The statement is fairly vague so it's a bit confusing that the
encryption scheme on the Context page doesn't include integrity. In
reality the encryption is AES-XTS with a physical address tweak, so no
integrity there. The integrity comes purely from the RMP with SNP, but it's
still integrity protected, right? So I don't think that part is wrong.
Tom Lendacky June 27, 2024, 5:13 p.m. UTC | #11
On 6/27/24 10:35, Sean Christopherson wrote:
> On Thu, Jun 27, 2024, Tom Lendacky wrote:
>> On 6/26/24 14:54, Sean Christopherson wrote:
>>> On Wed, Jun 26, 2024, Michael Roth wrote:
>>>>> What about the host kernel though?  I don't see anything here that ensures resp_pfn
>>>>> isn't "regular" memory, i.e. that ensure the page isn't being concurrently accessed
>>>>> by the host kernel (or some other userspace process).
>>>>>
>>>>> Or is the "private" memory still accessible by the host?
>>>>
>>>> It's accessible, but it is immutable according to RMP table, so so it would
>>>> require KVM to be elsewhere doing a write to the page,
>>>
>>> I take it "immutable" means "read-only"?  If so, it would be super helpful to
>>> document that in the APM.  I assumed "immutable" only meant that the RMP entry
>>> itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.
>>
>> Not quite. It depends on the page state associated with the page. For
>> example, Hypervisor-Fixed pages have the immutable bit set, but can be
>> read and written.
>>
>> The page states are documented in the SNP API (Chapter 5, Page
>> Management):
> 
> Heh, but then that section says:
> 
>   Pages in the Firmware state are owned by the firmware. Because the RMP.Immutable
>                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^
>   bit is set, the hypervisor cannot write to Firmware pages nor alter the RMP entry
>   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   with the RMPUPDATE instruction.
> 
> which to me very clearly suggests that the RMP.Immutable bit is what makes the
> page read-only.
> 
> Can you ask^Wbribe someone to add a "Table 11. Page State Properties" or something?
> E.g. to explicitly list out the read vs. write protections and the state of the
> page's data (encrypted, integrity-protected, zeroed?, etc).  I've read through
> all of "5.2 Page States" and genuinely have no clue as to what protections most
> of the states have.

I'll get with the document owner and provide that feedback and see what we
can do to remove some of the ambiguity and improve upon it.

> 
> Ah, never mind, I found "Table 15-39. RMP Memory Access Checks" in the APM.  FWIW,
> that somewhat contradicts this blurb from the SNP ABI spec:
> 
>   The content of a Context page is encrypted and integrity protected so that the
>   hypervisor cannot read or write to it.
> 
> I also find that statement confusing.  IIUC, the fact that the Context page is
> encrypted and integrity protected doesn't actually have anything to do with the
> host's ability to access the data.  The host _can_ read the data, but it will get
> ciphertext.  But the host can't write the data because the page isn't HV-owned.
> 
> Actually, isn't the intregrity protected part wrong?  I thought one of the benefits

The RMP protection is what helps provide the integrity protection. So if a
hypervisor tries to write to a non-hypervisor owned page, it will generate
an RMP #PF. If the page can't be RMPUPDATEd (the immutable bit is set for
the page in the RMP), then the page can't be written to by the hypervisor.

If the page can be RMPUPDATEd and made hypervisor-owned and was previously
PVALIDATEd, then a guest access (as private) will generate a #NPF. If the
hypervisor then assigns the page to the guest (after having possibly
written to the page), the guest will then get a #VC and detect that the
integrity of the page may have been compromised and it should take action.

> of SNP vs. ES is that the RMP means the VMSA doesn't have to be integrity protected,
> and so VMRUN and #VMEXIT transitions are faster because the CPU doesn't need to do
> as much work.

That is one of the benefits. A page can only be turned into a VMSA page
via an SNP_LAUNCH_UPDATE (becoming part of the guest measurement) or via
RMPADJUST (which can only be executed in the guest), making it a guest
private page. If an RMPUPDATE is done by the hypervisor, the VMSA bit will
be cleared and the VMSA won't be runnable anymore.

Thanks,
Tom
Sean Christopherson June 27, 2024, 6:07 p.m. UTC | #12
On Thu, Jun 27, 2024, Tom Lendacky wrote:
> On 6/27/24 10:35, Sean Christopherson wrote:
> >> The page states are documented in the SNP API (Chapter 5, Page
> >> Management):
> > 
> > Heh, but then that section says:
> > 
> >   Pages in the Firmware state are owned by the firmware. Because the RMP.Immutable
> >                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^
> >   bit is set, the hypervisor cannot write to Firmware pages nor alter the RMP entry
> >   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >   with the RMPUPDATE instruction.
> > 
> > which to me very clearly suggests that the RMP.Immutable bit is what makes the
> > page read-only.
> > 
> > Can you ask^Wbribe someone to add a "Table 11. Page State Properties" or something?
> > E.g. to explicitly list out the read vs. write protections and the state of the
> > page's data (encrypted, integrity-protected, zeroed?, etc).  I've read through
> > all of "5.2 Page States" and genuinely have no clue as to what protections most
> > of the states have.
> 
> I'll get with the document owner and provide that feedback and see what we
> can do to remove some of the ambiguity and improve upon it.

Thanks!

> > Ah, never mind, I found "Table 15-39. RMP Memory Access Checks" in the APM.  FWIW,
> > that somewhat contradicts this blurb from the SNP ABI spec:
> > 
> >   The content of a Context page is encrypted and integrity protected so that the
> >   hypervisor cannot read or write to it.
> > 
> > I also find that statement confusing.  IIUC, the fact that the Context page is
> > encrypted and integrity protected doesn't actually have anything to do with the
> > host's ability to access the data.  The host _can_ read the data, but it will get
> > ciphertext.  But the host can't write the data because the page isn't HV-owned.
> > 
> > Actually, isn't the intregrity protected part wrong?  I thought one of the benefits
> 
> The RMP protection is what helps provide the integrity protection. So if a
> hypervisor tries to write to a non-hypervisor owned page, it will generate
> an RMP #PF. If the page can't be RMPUPDATEd (the immutable bit is set for
> the page in the RMP), then the page can't be written to by the hypervisor.

My confusion (ok, maybe it's more annoyance than true confusion) is that that
applies to _all_ non-hypervisor pages, not just Context pages.  Reading things
from a "the exception proves the rule" viewpoint, stating that Context pages are
encrypted and integrity protected strongly suggests that all other pages are NOT
encrypted and NOT integrity protected.

Patch

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index df8818759698..7338b987cadd 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -19,6 +19,7 @@ 
 #include <linux/misc_cgroup.h>
 #include <linux/processor.h>
 #include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
 
 #include <asm/pkru.h>
 #include <asm/trapnr.h>
@@ -3321,6 +3322,10 @@  static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 		if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
 			goto vmgexit_err;
 		break;
+	case SVM_VMGEXIT_GUEST_REQUEST:
+		if (!sev_snp_guest(vcpu->kvm))
+			goto vmgexit_err;
+		break;
 	default:
 		reason = GHCB_ERR_INVALID_EVENT;
 		goto vmgexit_err;
@@ -3939,6 +3944,67 @@  static int sev_snp_ap_creation(struct vcpu_svm *svm)
 	return ret;
 }
 
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+	struct sev_data_snp_guest_request data = {0};
+	struct kvm *kvm = svm->vcpu.kvm;
+	kvm_pfn_t req_pfn, resp_pfn;
+	sev_ret_code fw_err = 0;
+	int ret;
+
+	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
+		return -EINVAL;
+
+	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));
+	if (is_error_noslot_pfn(req_pfn))
+		return -EINVAL;
+
+	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
+	if (is_error_noslot_pfn(resp_pfn)) {
+		ret = EINVAL;
+		goto release_req;
+	}
+
+	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
+		ret = -EINVAL;
+		kvm_release_pfn_clean(resp_pfn);
+		goto release_req;
+	}
+
+	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
+	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
+	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
+
+	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+	if (ret)
+		return ret;
+
+	/*
+	 * If reclaim fails then there's a good chance the guest will no longer
+	 * be runnable so just let userspace terminate the guest.
+	 */
+	if (snp_page_reclaim(kvm, resp_pfn)) {
+		return -EIO;
+		goto release_req;
+	}
+
+	/*
+	 * As per GHCB spec, firmware failures should be communicated back to
+	 * the guest via SW_EXITINFO2 rather than be treated as immediately
+	 * fatal.
+	 */
+	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
+				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
+					      fw_err));
+
+	ret = 1; /* resume guest */
+	kvm_release_pfn_dirty(resp_pfn);
+
+release_req:
+	kvm_release_pfn_clean(req_pfn);
+	return ret;
+}
+
 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -4213,6 +4279,9 @@  int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 
 		ret = 1;
 		break;
+	case SVM_VMGEXIT_GUEST_REQUEST:
+		ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+		break;
 	case SVM_VMGEXIT_UNSUPPORTED_EVENT:
 		vcpu_unimpl(vcpu,
 			    "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h
index 154a87a1eca9..7bd78e258569 100644
--- a/include/uapi/linux/sev-guest.h
+++ b/include/uapi/linux/sev-guest.h
@@ -89,8 +89,17 @@  struct snp_ext_report_req {
 #define SNP_GUEST_FW_ERR_MASK		GENMASK_ULL(31, 0)
 #define SNP_GUEST_VMM_ERR_SHIFT		32
 #define SNP_GUEST_VMM_ERR(x)		(((u64)x) << SNP_GUEST_VMM_ERR_SHIFT)
+#define SNP_GUEST_FW_ERR(x)		((x) & SNP_GUEST_FW_ERR_MASK)
+#define SNP_GUEST_ERR(vmm_err, fw_err)	(SNP_GUEST_VMM_ERR(vmm_err) | \
+					 SNP_GUEST_FW_ERR(fw_err))
 
+/*
+ * The GHCB spec only formally defines INVALID_LEN/BUSY VMM errors, but define
+ * a GENERIC error code such that it won't ever conflict with GHCB-defined
+ * errors if any get added in the future.
+ */
 #define SNP_GUEST_VMM_ERR_INVALID_LEN	1
 #define SNP_GUEST_VMM_ERR_BUSY		2
+#define SNP_GUEST_VMM_ERR_GENERIC	BIT(31)
 
 #endif /* __UAPI_LINUX_SEV_GUEST_H_ */