Message ID | 20240621134041.3170480-2-michael.roth@amd.com (mailing list archive)
---|---
State | New, archived
Series | SEV-SNP: Add KVM support for attestation and KVM_EXIT_COCO
On 21/06/2024 14:40, Michael Roth wrote:
> From: Brijesh Singh <brijesh.singh@amd.com>
>
> Version 2 of the GHCB specification added support for the SNP Guest Request
> Message NAE event. The event allows an SEV-SNP guest to make requests to
> the SEV-SNP firmware through the hypervisor using the SNP_GUEST_REQUEST API
> defined in the SEV-SNP firmware specification.
>
> This is used by guests primarily to request attestation reports from
> firmware. Other request types are available as well, but the specifics of
> what guest requests are being made are opaque to the hypervisor, which only
> serves as a proxy for the guest requests and firmware responses.
>
> Implement handling for these events.
>
> Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
> Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
> Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
> Co-developed-by: Ashish Kalra <ashish.kalra@amd.com>
> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
> Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
> [mdr: ensure FW command failures are indicated to guest, drop extended
>  request handling to be re-written as separate patch, massage commit]
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> Message-ID: <20240501085210.2213060-19-michael.roth@amd.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  arch/x86/kvm/svm/sev.c         | 69 ++++++++++++++++++++++++++++++++++
>  include/uapi/linux/sev-guest.h |  9 +++++
>  2 files changed, 78 insertions(+)
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index df8818759698..7338b987cadd 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -19,6 +19,7 @@
>  #include <linux/misc_cgroup.h>
>  #include <linux/processor.h>
>  #include <linux/trace_events.h>
> +#include <uapi/linux/sev-guest.h>
>
>  #include <asm/pkru.h>
>  #include <asm/trapnr.h>
> @@ -3321,6 +3322,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
>  		if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
>  			goto vmgexit_err;
>  		break;
> +	case SVM_VMGEXIT_GUEST_REQUEST:
> +		if (!sev_snp_guest(vcpu->kvm))
> +			goto vmgexit_err;
> +		break;
>  	default:
>  		reason = GHCB_ERR_INVALID_EVENT;
>  		goto vmgexit_err;
> @@ -3939,6 +3944,67 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
>  	return ret;
>  }
>
> +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
> +{
> +	struct sev_data_snp_guest_request data = {0};
> +	struct kvm *kvm = svm->vcpu.kvm;
> +	kvm_pfn_t req_pfn, resp_pfn;
> +	sev_ret_code fw_err = 0;
> +	int ret;
> +
> +	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
> +		return -EINVAL;
> +
> +	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));
> +	if (is_error_noslot_pfn(req_pfn))
> +		return -EINVAL;
> +
> +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> +	if (is_error_noslot_pfn(resp_pfn)) {
> +		ret = EINVAL;
> +		goto release_req;
> +	}
> +
> +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> +		ret = -EINVAL;
> +		kvm_release_pfn_clean(resp_pfn);
> +		goto release_req;
> +	}
> +
> +	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
> +	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
> +	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
> +
> +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> +	if (ret)
> +		return ret;

Should this be goto release_req; ?
Does resp_pfn need to be dealt with as well?

> +
> +	/*
> +	 * If reclaim fails then there's a good chance the guest will no longer
> +	 * be runnable so just let userspace terminate the guest.
> +	 */
> +	if (snp_page_reclaim(kvm, resp_pfn)) {
> +		return -EIO;

Should this be setting ret = -EIO ? Next line is unreachable.
Does resp_pfn need to be dealt with as well?

> +		goto release_req;
> +	}
> +
> +	/*
> +	 * As per GHCB spec, firmware failures should be communicated back to
> +	 * the guest via SW_EXITINFO2 rather than be treated as immediately
> +	 * fatal.
> +	 */
> +	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
> +				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
> +					      fw_err));
> +
> +	ret = 1; /* resume guest */
> +	kvm_release_pfn_dirty(resp_pfn);
> +
> +release_req:
> +	kvm_release_pfn_clean(req_pfn);
> +	return ret;
> +}
> +
>  static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
>  {
>  	struct vmcb_control_area *control = &svm->vmcb->control;
> @@ -4213,6 +4279,9 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
>
>  		ret = 1;
>  		break;
> +	case SVM_VMGEXIT_GUEST_REQUEST:
> +		ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
> +		break;
>  	case SVM_VMGEXIT_UNSUPPORTED_EVENT:
>  		vcpu_unimpl(vcpu,
>  			    "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
> diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h
> index 154a87a1eca9..7bd78e258569 100644
> --- a/include/uapi/linux/sev-guest.h
> +++ b/include/uapi/linux/sev-guest.h
> @@ -89,8 +89,17 @@ struct snp_ext_report_req {
>  #define SNP_GUEST_FW_ERR_MASK		GENMASK_ULL(31, 0)
>  #define SNP_GUEST_VMM_ERR_SHIFT		32
>  #define SNP_GUEST_VMM_ERR(x)		(((u64)x) << SNP_GUEST_VMM_ERR_SHIFT)
> +#define SNP_GUEST_FW_ERR(x)		((x) & SNP_GUEST_FW_ERR_MASK)
> +#define SNP_GUEST_ERR(vmm_err, fw_err)	(SNP_GUEST_VMM_ERR(vmm_err) | \
> +					 SNP_GUEST_FW_ERR(fw_err))
>
> +/*
> + * The GHCB spec only formally defines INVALID_LEN/BUSY VMM errors, but define
> + * a GENERIC error code such that it won't ever conflict with GHCB-defined
> + * errors if any get added in the future.
> + */
>  #define SNP_GUEST_VMM_ERR_INVALID_LEN	1
>  #define SNP_GUEST_VMM_ERR_BUSY		2
> +#define SNP_GUEST_VMM_ERR_GENERIC	BIT(31)
>
>  #endif /* __UAPI_LINUX_SEV_GUEST_H_ */

Regards,
Liam
On Fri, Jun 21, 2024 at 03:52:35PM +0000, Liam Merwick wrote:
> On 21/06/2024 14:40, Michael Roth wrote:

[...]

> > +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> > +	if (ret)
> > +		return ret;
>
> Should this be goto release_req; ?
> Does resp_pfn need to be dealt with as well?

Argh... yes. I folded some helper functions into here and missed updating
some of the return sites. I'll respond to this patch with a revised
version. Sorry for the noise.

-Mike
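[Editorial note: for readers following the thread, below is a minimal sketch of
how the tail of snp_handle_guest_req() might be restructured so that every exit
path releases the references it owns — an illustration of the issues flagged
above, not the revised patch Michael went on to post:]

	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);

	/*
	 * The response page was already transitioned to the firmware-owned
	 * RMP state, so attempt reclaim whether or not the command
	 * succeeded; otherwise the page can never be returned to the host.
	 */
	if (snp_page_reclaim(kvm, resp_pfn)) {
		/*
		 * If reclaim fails the guest is likely no longer runnable,
		 * so report -EIO and let userspace terminate it. resp_pfn is
		 * deliberately not released here since its RMP state is
		 * unknown, but the request-page reference must still be
		 * dropped.
		 */
		ret = -EIO;
		goto release_req;
	}

	/*
	 * Per the GHCB spec, firmware failures are reported to the guest via
	 * SW_EXITINFO2 rather than being treated as fatal.
	 */
	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
					      fw_err));

	ret = 1; /* resume guest */
	kvm_release_pfn_dirty(resp_pfn);

release_req:
	kvm_release_pfn_clean(req_pfn);
	return ret;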
On Fri, Jun 21, 2024, Michael Roth wrote:
> @@ -3939,6 +3944,67 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
>  	return ret;
>  }
>
> +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
> +{
> +	struct sev_data_snp_guest_request data = {0};
> +	struct kvm *kvm = svm->vcpu.kvm;
> +	kvm_pfn_t req_pfn, resp_pfn;
> +	sev_ret_code fw_err = 0;
> +	int ret;
> +
> +	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
> +		return -EINVAL;
> +
> +	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));

This is going to sound odd, but I think we should use gfn_to_page(), i.e. require
that the page be a refcounted page so that it can be pinned.  Long story short,
one of my learnings from the whole non-refcounted pages saga is that doing DMA
to unpinned pages outside of mmu_lock is wildly unsafe, and for all intents and
purposes the ASP is a device doing DMA.  I'm also pretty sure KVM should actually
*pin* pages, as in FOLL_PIN, but I'm ok tackling that later.

For now, using gfn_to_page() would avoid creating ABI (DMA to VM_PFNMAP and/or
VM_MIXEDMAP memory) that KVM probably doesn't want to support in the long term.

[*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com

> +	if (is_error_noslot_pfn(req_pfn))
> +		return -EINVAL;
> +
> +	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
> +	if (is_error_noslot_pfn(resp_pfn)) {
> +		ret = EINVAL;
> +		goto release_req;
> +	}
> +
> +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> +		ret = -EINVAL;
> +		kvm_release_pfn_clean(resp_pfn);
> +		goto release_req;
> +	}

I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
resp_pfn stays private for the duration of the operation.  And on the opposite
side, KVM can't guarantee that resp_pfn isn't being actively used by something
in the kernel, e.g. KVM might induce an unexpected #PF(RMP).

Why can't KVM require that the response/destination page already be private?

I'm also somewhat confused by the reclaim below.  If KVM converts the response
page back to shared, doesn't that clobber the data?  If so, how does the guest
actually get the response?  I feel like I'm missing something.

Regardless of whether or not I'm missing something, this needs comments, and the
changelog needs to be rewritten with --verbose to explain what's going on.

> +	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
> +	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
> +	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
> +
> +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> +	if (ret)
> +		return ret;

This leaks both req_pfn and resp_pfn.

> +
> +	/*
> +	 * If reclaim fails then there's a good chance the guest will no longer
> +	 * be runnable so just let userspace terminate the guest.
> +	 */
> +	if (snp_page_reclaim(kvm, resp_pfn)) {
> +		return -EIO;
> +		goto release_req;
> +	}
> +
> +	/*
> +	 * As per GHCB spec, firmware failures should be communicated back to
> +	 * the guest via SW_EXITINFO2 rather than be treated as immediately
> +	 * fatal.
> +	 */
> +	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
> +				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
> +					      fw_err));
> +
> +	ret = 1; /* resume guest */
> +	kvm_release_pfn_dirty(resp_pfn);
> +
> +release_req:
> +	kvm_release_pfn_clean(req_pfn);
> +	return ret;
> +}
On Wed, Jun 26, 2024 at 06:58:09AM -0700, Sean Christopherson wrote:
> On Fri, Jun 21, 2024, Michael Roth wrote:
> > +	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));
>
> This is going to sound odd, but I think we should use gfn_to_page(), i.e. require
> that the page be a refcounted page so that it can be pinned. Long story short,
> one of my learnings from the whole non-refcounted pages saga is that doing DMA
> to unpinned pages outside of mmu_lock is wildly unsafe, and for all intents and
> purposes the ASP is a device doing DMA. I'm also pretty sure KVM should actually
> *pin* pages, as in FOLL_PIN, but I'm ok tackling that later.
>
> For now, using gfn_to_page() would avoid creating ABI (DMA to VM_PFNMAP and/or
> VM_MIXEDMAP memory) that KVM probably doesn't want to support in the long term.

That seems reasonable. Will switch this to gfn_to_page().

> [*] https://lore.kernel.org/all/20240229025759.1187910-1-stevensd@google.com
>
> > +	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
> > +		ret = -EINVAL;
> > +		kvm_release_pfn_clean(resp_pfn);
> > +		goto release_req;
> > +	}
>
> I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee that the
> resp_pfn stays private for the duration of the operation.

When the page is set to private with asid=0,immutable=true arguments, this
puts the page in a special 'firmware-owned' state, specifically to avoid any
changes to the page state happening from under the ASP's feet. The only way
to switch the page to any other state at this point is to issue the
SEV_CMD_SNP_PAGE_RECLAIM request to the ASP via snp_page_reclaim().

I could see the guest shooting itself in the foot by issuing 2 guest
requests with the same req_pfn/resp_pfn, but on the KVM side whichever
request issues rmp_make_private() first would succeed, and then the 2nd
request would generate an EINVAL to userspace.

In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
lock/unlock a page that's being handed to the ASP. But this should be
better documented either way.

> And on the opposite side, KVM can't guarantee that resp_pfn isn't being
> actively used by something in the kernel, e.g. KVM might induce an
> unexpected #PF(RMP).
>
> Why can't KVM require that the response/destination page already be private?

Hmm. This is a bit tricky. The GHCB spec states:

  The Guest Request NAE event requires two unique pages, one page for the
  request and one page for the response. Both pages must be assigned to the
  hypervisor (shared). The guest must supply the guest physical address of
  the pages (i.e., page aligned) as input.

  The hypervisor must translate the guest physical address (GPA) of each
  page into a system physical address (SPA). The SPA is used to verify that
  the request and response pages are assigned to the hypervisor.

At least for req_pfn, this makes sense because the header of the message is
actually plain text, and KVM does need to parse it to read the message type
from the header. It's just the req/resp payload that are encrypted by the
guest/firmware, i.e. it's not relying on hardware-based memory encryption in
this case.

Because of that though, I think we could potentially address this by
allocating the req/resp pages as separate pages outside of guest memory, and
simply copying the contents to/from the GPAs provided by the guest. I'll
look more into this approach.

If we go that route I think some of the concerns above go away as well, but
we might still need a lock or something to serialize access to these
separate/intermediate pages, to avoid needing to allocate them every time or
per-request.

> I'm also somewhat confused by the reclaim below.  If KVM converts the response
> page back to shared, doesn't that clobber the data?  If so, how does the guest
> actually get the response?  I feel like I'm missing something.

In this case we're just setting it immutable/firmware-owned, which just
happens to be equivalent (in terms of the RMP table) to a guest-owned page,
but with rmp_entry.ASID=0/rmp_entry.immutable=true instead of
rmp_entry.ASID=<guest asid>/rmp_entry.immutable=false. So the contents
remain intact/readable after the reclaim.

> Regardless of whether or not I'm missing something, this needs comments, and the
> changelog needs to be rewritten with --verbose to explain what's going on.

Sure, will add more comments and details in the commit msg for v2.

> > +	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
> > +	if (ret)
> > +		return ret;
>
> This leaks both req_pfn and resp_pfn.

Yes, this one should be fixed up by the v1-revised version of the patch.

Thanks!

-Mike
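[Editorial note: to make the bounce-page idea Michael describes concrete, here
is a minimal sketch assuming per-VM intermediate buffers guarded by a mutex.
The guest_req_mutex/guest_req_buf/guest_resp_buf fields and the overall shape
are hypothetical illustrations, not code from the posted series; the RMP state
transitions on the bounce pages around the firmware call are elided:]

static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
{
	struct sev_data_snp_guest_request data = {0};
	struct kvm *kvm = svm->vcpu.kvm;
	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
	sev_ret_code fw_err = 0;
	int ret;

	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
		return -EINVAL;

	/* Serialize use of the per-VM bounce pages (hypothetical field). */
	mutex_lock(&sev->guest_req_mutex);

	ret = -EIO;
	if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE))
		goto out_unlock;

	/*
	 * Firmware only ever sees the KVM-owned bounce pages; the
	 * guest-provided GPAs are only ever copied to/from, so no
	 * guest-controlled page needs to transition to the firmware-owned
	 * RMP state.
	 */
	data.gctx_paddr = __psp_pa(sev->snp_context);
	data.req_paddr  = __psp_pa(sev->guest_req_buf);
	data.res_paddr  = __psp_pa(sev->guest_resp_buf);

	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);

	/* Copy the response out even on failure; fw_err is reported below. */
	if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
		ret = -EIO;
		goto out_unlock;
	}

	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
					      fw_err));
	ret = 1; /* resume guest */

out_unlock:
	mutex_unlock(&sev->guest_req_mutex);
	return ret;
}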
On Wed, Jun 26, 2024, Michael Roth wrote:
> On Wed, Jun 26, 2024 at 06:58:09AM -0700, Sean Christopherson wrote:
> > I don't see how this is safe.  KVM holds no locks, i.e. can't guarantee
> > that the resp_pfn stays private for the duration of the operation.
>
> When the page is set to private with asid=0,immutable=true arguments, this
> puts the page in a special 'firmware-owned' state, specifically to avoid any
> changes to the page state happening from under the ASP's feet.

[...]

> In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
> lock/unlock a page that's being handed to the ASP. But this should be
> better documented either way.

What about the host kernel though?  I don't see anything here that ensures
resp_pfn isn't "regular" memory, i.e. that ensures the page isn't being
concurrently accessed by the host kernel (or some other userspace process).

Or is the "private" memory still accessible by the host?

> > I'm also somewhat confused by the reclaim below.  If KVM converts the
> > response page back to shared, doesn't that clobber the data?  If so, how
> > does the guest actually get the response?  I feel like I'm missing something.
>
> In this case we're just setting it immutable/firmware-owned, which just
> happens to be equivalent (in terms of the RMP table) to a guest-owned page,
> but with rmp_entry.ASID=0/rmp_entry.immutable=true instead of
> rmp_entry.ASID=<guest asid>/rmp_entry.immutable=false. So the contents
> remain intact/readable after the reclaim.

Ah, I see the @asid=0 now.  The @asid=0,@immutable=true should be a separate API,
because IIUC, this always holds true:

  !asid == immutable

E.g.

static int rmp_assign_pfn(u64 pfn, u64 gpa, enum pg_level level, u32 asid,
			  bool immutable)
{
	struct rmp_state state;

	memset(&state, 0, sizeof(state));
	state.assigned = 1;
	state.asid = asid;
	state.immutable = immutable;
	state.gpa = gpa;
	state.pagesize = PG_LEVEL_TO_RMP(level);

	return rmpupdate(pfn, &state);
}

/* Transition a page to guest-owned/private state in the RMP table. */
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid)
{
	if (WARN_ON_ONCE(!asid))
		return -EIO;

	return rmp_assign_pfn(pfn, gpa, level, asid, false);
}
EXPORT_SYMBOL_GPL(rmp_make_private);

/* Transition a page to AMD-SP owned state in the RMP table. */
int rmp_make_firmware(u64 pfn, u64 gpa)
{
	return rmp_assign_pfn(pfn, gpa, PG_LEVEL_4K, 0, true);
}
EXPORT_SYMBOL_GPL(rmp_make_firmware);
On Wed, Jun 26, 2024 at 10:13:44AM -0700, Sean Christopherson wrote:
> On Wed, Jun 26, 2024, Michael Roth wrote:
> > In that sense, rmp_make_private()/snp_page_reclaim() sort of pair to
> > lock/unlock a page that's being handed to the ASP. But this should be
> > better documented either way.
>
> What about the host kernel though?  I don't see anything here that ensures
> resp_pfn isn't "regular" memory, i.e. that ensures the page isn't being
> concurrently accessed by the host kernel (or some other userspace process).
>
> Or is the "private" memory still accessible by the host?

It's accessible, but it is immutable according to the RMP table, so it would
require KVM to be elsewhere doing a write to the page, but that seems
possible if the guest is misbehaved. So I do think the RMP #PF concerns are
warranted, and that looking at using KVM-allocated intermediary/"bounce"
pages to pass to firmware is definitely worth looking into for v2, as that's
just about the safest way to guarantee nothing else will be writing to the
page after it gets set to immutable/firmware-owned.

> Ah, I see the @asid=0 now.  The @asid=0,@immutable=true should be a separate API,
> because IIUC, this always holds true:
>
>   !asid == immutable

That makes sense. I'll look at introducing rmp_make_firmware() as part of
the next revision.

-Mike
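[Editorial note: for illustration, with the helper Sean sketches above, the
call site in snp_handle_guest_req() would presumably collapse to something
like the following — a sketch using the proposed (not yet merged)
rmp_make_firmware():]

	/*
	 * Transition the response page to the firmware-owned RMP state
	 * (assigned, asid=0, immutable) before handing it to the ASP;
	 * snp_page_reclaim() is the paired operation that returns it to a
	 * hypervisor-usable state afterwards.
	 */
	if (rmp_make_firmware(resp_pfn, 0)) {
		ret = -EINVAL;
		kvm_release_pfn_clean(resp_pfn);
		goto release_req;
	}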
On Wed, Jun 26, 2024, Michael Roth wrote:
> On Wed, Jun 26, 2024 at 10:13:44AM -0700, Sean Christopherson wrote:
> > What about the host kernel though?  I don't see anything here that ensures
> > resp_pfn isn't "regular" memory, i.e. that ensures the page isn't being
> > concurrently accessed by the host kernel (or some other userspace process).
> >
> > Or is the "private" memory still accessible by the host?
>
> It's accessible, but it is immutable according to the RMP table, so it would
> require KVM to be elsewhere doing a write to the page,

I take it "immutable" means "read-only"?  If so, it would be super helpful to
document that in the APM.  I assumed "immutable" only meant that the RMP entry
itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.

> but that seems possible if the guest is misbehaved. So I do think the RMP #PF
> concerns are warranted, and that looking at using KVM-allocated
> intermediary/"bounce" pages to pass to firmware is definitely worth looking
> into for v2, as that's just about the safest way to guarantee nothing else
> will be writing to the page after it gets set to immutable/firmware-owned.
On 6/26/24 14:54, Sean Christopherson wrote:
> On Wed, Jun 26, 2024, Michael Roth wrote:
>> It's accessible, but it is immutable according to the RMP table, so it would
>> require KVM to be elsewhere doing a write to the page,
>
> I take it "immutable" means "read-only"?  If so, it would be super helpful to
> document that in the APM.  I assumed "immutable" only meant that the RMP entry
> itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.

Not quite. It depends on the page state associated with the page. For
example, Hypervisor-Fixed pages have the immutable bit set, but can be
read and written.

The page states are documented in the SNP API (Chapter 5, Page Management):

https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf

Thanks,
Tom
On Thu, Jun 27, 2024, Tom Lendacky wrote:
> On 6/26/24 14:54, Sean Christopherson wrote:
> > I take it "immutable" means "read-only"?  If so, it would be super helpful to
> > document that in the APM.  I assumed "immutable" only meant that the RMP entry
> > itself is immutable, and that Assigned=AMD-SP is what prevented host accesses.
>
> Not quite. It depends on the page state associated with the page. For
> example, Hypervisor-Fixed pages have the immutable bit set, but can be
> read and written.
>
> The page states are documented in the SNP API (Chapter 5, Page Management):

Heh, but then that section says:

  Pages in the Firmware state are owned by the firmware. Because the RMP.Immutable
                                ^^^^^^^^^^^^^^^^^^^^^^^^^
  bit is set, the hypervisor cannot write to Firmware pages nor alter the RMP entry
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  with the RMPUPDATE instruction.

which to me very clearly suggests that the RMP.Immutable bit is what makes the
page read-only.

Can you ask^Wbribe someone to add a "Table 11. Page State Properties" or something?
E.g. to explicitly list out the read vs. write protections and the state of the
page's data (encrypted, integrity-protected, zeroed?, etc).  I've read through
all of "5.2 Page States" and genuinely have no clue as to what protections most
of the states have.

Ah, never mind, I found "Table 15-39. RMP Memory Access Checks" in the APM.  FWIW,
that somewhat contradicts this blurb from the SNP ABI spec:

  The content of a Context page is encrypted and integrity protected so that the
  hypervisor cannot read or write to it.

I also find that statement confusing.  IIUC, the fact that the Context page is
encrypted and integrity protected doesn't actually have anything to do with the
host's ability to access the data.  The host _can_ read the data, but it will get
ciphertext.  But the host can't write the data because the page isn't HV-owned.

Actually, isn't the integrity protected part wrong?  I thought one of the benefits
of SNP vs. ES is that the RMP means the VMSA doesn't have to be integrity protected,
and so VMRUN and #VMEXIT transitions are faster because the CPU doesn't need to do
as much work.
On Thu, Jun 27, 2024 at 9:35 AM Sean Christopherson <seanjc@google.com> wrote:
> Actually, isn't the integrity protected part wrong?  I thought one of the benefits
> of SNP vs. ES is that the RMP means the VMSA doesn't have to be integrity protected,
> and so VMRUN and #VMEXIT transitions are faster because the CPU doesn't need to do
> as much work.

The statement is fairly vague, so it's a bit confusing that the encryption
scheme on the Context page doesn't include integrity. In reality the
encryption is AES-XTS with a physical address tweak, so no integrity. The
integrity comes purely from the RMP with SNP, but it's still integrity
protected, right? So I don't think that part is wrong.
On 6/27/24 10:35, Sean Christopherson wrote:
> Can you ask^Wbribe someone to add a "Table 11. Page State Properties" or something?
> E.g. to explicitly list out the read vs. write protections and the state of the
> page's data (encrypted, integrity-protected, zeroed?, etc).  I've read through
> all of "5.2 Page States" and genuinely have no clue as to what protections most
> of the states have.

I'll get with the document owner and provide that feedback and see what we
can do to remove some of the ambiguity and improve upon it.

> Actually, isn't the integrity protected part wrong?

The RMP protection is what helps provide the integrity protection. So if a
hypervisor tries to write to a non-hypervisor owned page, it will generate
an RMP #PF. If the page can't be RMPUPDATEd (the immutable bit is set for
the page in the RMP), then the page can't be written to by the hypervisor.

If the page can be RMPUPDATEd and made hypervisor-owned and was previously
PVALIDATEd, then a guest access (as private) will generate a #NPF. If the
hypervisor then assigns the page to the guest (after having possibly written
to the page), the guest will then get a #VC and detect that the integrity of
the page may have been compromised, and it should take action.

> I thought one of the benefits of SNP vs. ES is that the RMP means the VMSA
> doesn't have to be integrity protected, and so VMRUN and #VMEXIT transitions
> are faster because the CPU doesn't need to do as much work.

That is one of the benefits. A page can only be turned into a VMSA page via
an SNP_LAUNCH_UPDATE (becoming part of the guest measurement) or via
RMPADJUST (which can only be executed in the guest), making it a guest
private page. If an RMPUPDATE is done by the hypervisor, the VMSA bit will
be cleared and the VMSA won't be runnable anymore.

Thanks,
Tom
On Thu, Jun 27, 2024, Tom Lendacky wrote:
> On 6/27/24 10:35, Sean Christopherson wrote:
> > Can you ask^Wbribe someone to add a "Table 11. Page State Properties" or something?
>
> I'll get with the document owner and provide that feedback and see what we
> can do to remove some of the ambiguity and improve upon it.

Thanks!

> The RMP protection is what helps provide the integrity protection. So if a
> hypervisor tries to write to a non-hypervisor owned page, it will generate
> an RMP #PF. If the page can't be RMPUPDATEd (the immutable bit is set for
> the page in the RMP), then the page can't be written to by the hypervisor.

My confusion (ok, maybe it's more annoyance than true confusion) is that that
applies to _all_ non-hypervisor pages, not just Context pages.  Reading things
from a "the exception proves the rule" viewpoint, stating that Context pages are
encrypted and integrity protected strongly suggests that all other pages are NOT
encrypted and NOT integrity protected.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index df8818759698..7338b987cadd 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -19,6 +19,7 @@
 #include <linux/misc_cgroup.h>
 #include <linux/processor.h>
 #include <linux/trace_events.h>
+#include <uapi/linux/sev-guest.h>
 
 #include <asm/pkru.h>
 #include <asm/trapnr.h>
@@ -3321,6 +3322,10 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
 		if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm))
 			goto vmgexit_err;
 		break;
+	case SVM_VMGEXIT_GUEST_REQUEST:
+		if (!sev_snp_guest(vcpu->kvm))
+			goto vmgexit_err;
+		break;
 	default:
 		reason = GHCB_ERR_INVALID_EVENT;
 		goto vmgexit_err;
@@ -3939,6 +3944,67 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
 	return ret;
 }
 
+static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa)
+{
+	struct sev_data_snp_guest_request data = {0};
+	struct kvm *kvm = svm->vcpu.kvm;
+	kvm_pfn_t req_pfn, resp_pfn;
+	sev_ret_code fw_err = 0;
+	int ret;
+
+	if (!sev_snp_guest(kvm) || !PAGE_ALIGNED(req_gpa) || !PAGE_ALIGNED(resp_gpa))
+		return -EINVAL;
+
+	req_pfn = gfn_to_pfn(kvm, gpa_to_gfn(req_gpa));
+	if (is_error_noslot_pfn(req_pfn))
+		return -EINVAL;
+
+	resp_pfn = gfn_to_pfn(kvm, gpa_to_gfn(resp_gpa));
+	if (is_error_noslot_pfn(resp_pfn)) {
+		ret = EINVAL;
+		goto release_req;
+	}
+
+	if (rmp_make_private(resp_pfn, 0, PG_LEVEL_4K, 0, true)) {
+		ret = -EINVAL;
+		kvm_release_pfn_clean(resp_pfn);
+		goto release_req;
+	}
+
+	data.gctx_paddr = __psp_pa(to_kvm_sev_info(kvm)->snp_context);
+	data.req_paddr = __sme_set(req_pfn << PAGE_SHIFT);
+	data.res_paddr = __sme_set(resp_pfn << PAGE_SHIFT);
+
+	ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err);
+	if (ret)
+		return ret;
+
+	/*
+	 * If reclaim fails then there's a good chance the guest will no longer
+	 * be runnable so just let userspace terminate the guest.
+	 */
+	if (snp_page_reclaim(kvm, resp_pfn)) {
+		return -EIO;
+		goto release_req;
+	}
+
+	/*
+	 * As per GHCB spec, firmware failures should be communicated back to
+	 * the guest via SW_EXITINFO2 rather than be treated as immediately
+	 * fatal.
+	 */
+	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
+				SNP_GUEST_ERR(ret ? SNP_GUEST_VMM_ERR_GENERIC : 0,
+					      fw_err));
+
+	ret = 1; /* resume guest */
+	kvm_release_pfn_dirty(resp_pfn);
+
+release_req:
+	kvm_release_pfn_clean(req_pfn);
+	return ret;
+}
+
 static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -4213,6 +4279,9 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 
 		ret = 1;
 		break;
+	case SVM_VMGEXIT_GUEST_REQUEST:
+		ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2);
+		break;
 	case SVM_VMGEXIT_UNSUPPORTED_EVENT:
 		vcpu_unimpl(vcpu,
 			    "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h
index 154a87a1eca9..7bd78e258569 100644
--- a/include/uapi/linux/sev-guest.h
+++ b/include/uapi/linux/sev-guest.h
@@ -89,8 +89,17 @@ struct snp_ext_report_req {
 #define SNP_GUEST_FW_ERR_MASK		GENMASK_ULL(31, 0)
 #define SNP_GUEST_VMM_ERR_SHIFT		32
 #define SNP_GUEST_VMM_ERR(x)		(((u64)x) << SNP_GUEST_VMM_ERR_SHIFT)
+#define SNP_GUEST_FW_ERR(x)		((x) & SNP_GUEST_FW_ERR_MASK)
+#define SNP_GUEST_ERR(vmm_err, fw_err)	(SNP_GUEST_VMM_ERR(vmm_err) | \
+					 SNP_GUEST_FW_ERR(fw_err))
 
+/*
+ * The GHCB spec only formally defines INVALID_LEN/BUSY VMM errors, but define
+ * a GENERIC error code such that it won't ever conflict with GHCB-defined
+ * errors if any get added in the future.
+ */
 #define SNP_GUEST_VMM_ERR_INVALID_LEN	1
 #define SNP_GUEST_VMM_ERR_BUSY		2
+#define SNP_GUEST_VMM_ERR_GENERIC	BIT(31)
 
 #endif /* __UAPI_LINUX_SEV_GUEST_H_ */
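[Editorial note: as a closing illustration of the error encoding these macros
produce — VMM error in the upper 32 bits of SW_EXITINFO2, firmware error in
the lower 32 — here is a small, self-contained userspace sketch, not code
from the series, with the kernel-only BIT()/GENMASK_ULL() helpers expanded by
hand:]

#include <stdint.h>
#include <stdio.h>

/* Mirrors the uapi definitions above. */
#define SNP_GUEST_FW_ERR_MASK		((uint64_t)0xffffffff)
#define SNP_GUEST_VMM_ERR_SHIFT		32
#define SNP_GUEST_VMM_ERR(x)		(((uint64_t)(x)) << SNP_GUEST_VMM_ERR_SHIFT)
#define SNP_GUEST_FW_ERR(x)		((x) & SNP_GUEST_FW_ERR_MASK)
#define SNP_GUEST_ERR(vmm_err, fw_err)	(SNP_GUEST_VMM_ERR(vmm_err) | \
					 SNP_GUEST_FW_ERR(fw_err))

#define SNP_GUEST_VMM_ERR_INVALID_LEN	1
#define SNP_GUEST_VMM_ERR_BUSY		2
#define SNP_GUEST_VMM_ERR_GENERIC	(1u << 31)

int main(void)
{
	/* e.g. a firmware status of 0x16 with no VMM-side error ... */
	uint64_t exitinfo2 = SNP_GUEST_ERR(0, 0x16);

	/* ... decodes back into its two halves. */
	printf("vmm_err=%#x fw_err=%#x\n",
	       (uint32_t)(exitinfo2 >> SNP_GUEST_VMM_ERR_SHIFT),
	       (uint32_t)(exitinfo2 & SNP_GUEST_FW_ERR_MASK));
	return 0;
}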