Message ID | 3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra@amd.com (mailing list archive) |
---|---|
State | New, archived |
Series | Add AMD Secure Nested Paging (SEV-SNP) |
On Mon, Jun 20, 2022 at 5:05 PM Ashish Kalra <Ashish.Kalra@amd.com> wrote: > > From: Brijesh Singh <brijesh.singh@amd.com> > > The behavior and requirement for the SEV-legacy command is altered when > the SNP firmware is in the INIT state. See SEV-SNP firmware specification > for more details. > > Allocate the Trusted Memory Region (TMR) as a 2mb sized/aligned region > when SNP is enabled to satify new requirements for the SNP. Continue satisfy > allocating a 1mb region for !SNP configuration. > > While at it, provide API that can be used by others to allocate a page > that can be used by the firmware. The immediate user for this API will > be the KVM driver. The KVM driver to need to allocate a firmware context > page during the guest creation. The context page need to be updated > by the firmware. See the SEV-SNP specification for further details. > > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> > --- > drivers/crypto/ccp/sev-dev.c | 173 +++++++++++++++++++++++++++++++++-- > include/linux/psp-sev.h | 11 +++ > 2 files changed, 178 insertions(+), 6 deletions(-) > > diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c > index 35d76333e120..0dbd99f29b25 100644 > --- a/drivers/crypto/ccp/sev-dev.c > +++ b/drivers/crypto/ccp/sev-dev.c > @@ -79,6 +79,14 @@ static void *sev_es_tmr; > #define NV_LENGTH (32 * 1024) > static void *sev_init_ex_buffer; > > +/* When SEV-SNP is enabled the TMR needs to be 2MB aligned and 2MB size. */ > +#define SEV_SNP_ES_TMR_SIZE (2 * 1024 * 1024) > + > +static size_t sev_es_tmr_size = SEV_ES_TMR_SIZE; Why not keep all this TMR stuff together near the SEV_ES_TMR_SIZE define? > + > +static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret); > +static int sev_do_cmd(int cmd, void *data, int *psp_ret); > + > static inline bool sev_version_greater_or_equal(u8 maj, u8 min) > { > struct sev_device *sev = psp_master->sev_data; > @@ -177,11 +185,161 @@ static int sev_cmd_buffer_len(int cmd) > return 0; > } > > +static void snp_leak_pages(unsigned long pfn, unsigned int npages) > +{ > + WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages); > + while (npages--) { > + memory_failure(pfn, 0); > + dump_rmpentry(pfn); > + pfn++; > + } > +} > + > +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked) > +{ > + struct sev_data_snp_page_reclaim data; > + int ret, err, i, n = 0; > + > + for (i = 0; i < npages; i++) { What about setting |n| here too, also the other increments. for (i = 0, n = 0; i < npages; i++, n++, pfn++) > + memset(&data, 0, sizeof(data)); > + data.paddr = pfn << PAGE_SHIFT; > + > + if (locked) > + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > + else > + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); Can we change `sev_cmd_mutex` to some sort of nesting lock type? That could clean up this if (locked) code. > + if (ret) > + goto cleanup; > + > + ret = rmp_make_shared(pfn, PG_LEVEL_4K); > + if (ret) > + goto cleanup; > + > + pfn++; > + n++; > + } > + > + return 0; > + > +cleanup: > + /* > + * If failed to reclaim the page then page is no longer safe to > + * be released, leak it. 
> + */ > + snp_leak_pages(pfn, npages - n); > + return ret; > +} > + > +static inline int rmp_make_firmware(unsigned long pfn, int level) > +{ > + return rmp_make_private(pfn, 0, level, 0, true); > +} > + > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, > + bool need_reclaim) This function can do a lot and when I read the call sites its hard to see what its doing since we have a combination of arguments which tell us what behavior is happening, some of which are not valid (ex: to_fw == true and need_reclaim == true is an invalid argument combination). Also this for loop over |npages| is duplicated from snp_reclaim_pages(). One improvement here is that on the current snp_reclaim_pages() if we fail to reclaim a page we assume we cannot reclaim the next pages, this may cause us to snp_leak_pages() more pages than we actually need too. What about something like this? static snp_leak_page(u64 pfn, enum pg_level level) { memory_failure(pfn, 0); dump_rmpentry(pfn); } static int snp_reclaim_page(u64 pfn, enum pg_level level) { int ret; struct sev_data_snp_page_reclaim data; ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); if (ret) goto cleanup; ret = rmp_make_shared(pfn, level); if (ret) goto cleanup; return 0; cleanup: snp_leak_page(pfn, level) } typedef int (*rmp_state_change_func) (u64 pfn, enum pg_level level); static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, rmp_state_change_func state_change, rmp_state_change_func cleanup) { struct sev_data_snp_page_reclaim data; int ret, err, i, n = 0; for (i = 0, n = 0; i < npages; i++, n++, pfn++) { ret = state_change(pfn, PG_LEVEL_4K) if (ret) goto cleanup; } return 0; cleanup: for (; i>= 0; i--, n--, pfn--) { cleanup(pfn, PG_LEVEL_4K); } return ret; } Then inside of __snp_alloc_firmware_pages(): snp_set_rmp_state(paddr, npages, rmp_make_firmware, snp_reclaim_page); And inside of __snp_free_firmware_pages(): snp_set_rmp_state(paddr, npages, snp_reclaim_page, snp_leak_page); Just a suggestion feel free to ignore. The readability comment could be addressed much less invasively by just making separate functions for each valid combination of arguments here. Like snp_set_rmp_fw_state(), snp_set_rmp_shared_state(), snp_set_rmp_release_state() or something. > +{ > + unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */ > + int rc, n = 0, i; > + > + for (i = 0; i < npages; i++) { > + if (to_fw) > + rc = rmp_make_firmware(pfn, PG_LEVEL_4K); > + else > + rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) : > + rmp_make_shared(pfn, PG_LEVEL_4K); > + if (rc) > + goto cleanup; > + > + pfn++; > + n++; > + } > + > + return 0; > + > +cleanup: > + /* Try unrolling the firmware state changes */ > + if (to_fw) { > + /* > + * Reclaim the pages which were already changed to the > + * firmware state. > + */ > + snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked); > + > + return rc; > + } > + > + /* > + * If failed to change the page state to shared, then its not safe > + * to release the page back to the system, leak it. 
> + */ > + snp_leak_pages(pfn, npages - n); > + > + return rc; > +} > + > +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order, bool locked) > +{ > + unsigned long npages = 1ul << order, paddr; > + struct sev_device *sev; > + struct page *page; > + > + if (!psp_master || !psp_master->sev_data) > + return NULL; > + > + page = alloc_pages(gfp_mask, order); > + if (!page) > + return NULL; > + > + /* If SEV-SNP is initialized then add the page in RMP table. */ > + sev = psp_master->sev_data; > + if (!sev->snp_inited) > + return page; > + > + paddr = __pa((unsigned long)page_address(page)); > + if (snp_set_rmp_state(paddr, npages, true, locked, false)) > + return NULL; So what about the case where snp_set_rmp_state() fails but we were able to reclaim all the pages? Should we be able to signal that to callers so that we could free |page| here? But given this is an error path already maybe we can optimize this in a follow up series. > + > + return page; > +} > + > +void *snp_alloc_firmware_page(gfp_t gfp_mask) > +{ > + struct page *page; > + > + page = __snp_alloc_firmware_pages(gfp_mask, 0, false); > + > + return page ? page_address(page) : NULL; > +} > +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page); > + > +static void __snp_free_firmware_pages(struct page *page, int order, bool locked) > +{ > + unsigned long paddr, npages = 1ul << order; > + > + if (!page) > + return; > + > + paddr = __pa((unsigned long)page_address(page)); > + if (snp_set_rmp_state(paddr, npages, false, locked, true)) > + return; Here we may be able to free some of |page| depending how where inside of snp_set_rmp_state() we failed. But again given this is an error path already maybe we can optimize this in a follow up series. > + > + __free_pages(page, order); > +} > + > +void snp_free_firmware_page(void *addr) > +{ > + if (!addr) > + return; > + > + __snp_free_firmware_pages(virt_to_page(addr), 0, false); > +} > +EXPORT_SYMBOL(snp_free_firmware_page); > + > static void *sev_fw_alloc(unsigned long len) > { > struct page *page; > > - page = alloc_pages(GFP_KERNEL, get_order(len)); > + page = __snp_alloc_firmware_pages(GFP_KERNEL, get_order(len), false); > if (!page) > return NULL; > > @@ -393,7 +551,7 @@ static int __sev_init_locked(int *error) > data.tmr_address = __pa(sev_es_tmr); > > data.flags |= SEV_INIT_FLAGS_SEV_ES; > - data.tmr_len = SEV_ES_TMR_SIZE; > + data.tmr_len = sev_es_tmr_size; > } > > return __sev_do_cmd_locked(SEV_CMD_INIT, &data, error); > @@ -421,7 +579,7 @@ static int __sev_init_ex_locked(int *error) > data.tmr_address = __pa(sev_es_tmr); > > data.flags |= SEV_INIT_FLAGS_SEV_ES; > - data.tmr_len = SEV_ES_TMR_SIZE; > + data.tmr_len = sev_es_tmr_size; > } > > return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error); > @@ -818,6 +976,8 @@ static int __sev_snp_init_locked(int *error) > sev->snp_inited = true; > dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); > > + sev_es_tmr_size = SEV_SNP_ES_TMR_SIZE; > + > return rc; > } > > @@ -1341,8 +1501,9 @@ static void sev_firmware_shutdown(struct sev_device *sev) > /* The TMR area was encrypted, flush it from the cache */ > wbinvd_on_all_cpus(); > > - free_pages((unsigned long)sev_es_tmr, > - get_order(SEV_ES_TMR_SIZE)); > + __snp_free_firmware_pages(virt_to_page(sev_es_tmr), > + get_order(sev_es_tmr_size), > + false); > sev_es_tmr = NULL; > } > > @@ -1430,7 +1591,7 @@ void sev_pci_init(void) > } > > /* Obtain the TMR memory area for SEV-ES use */ > - sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE); > + sev_es_tmr = sev_fw_alloc(sev_es_tmr_size); > 
if (!sev_es_tmr) > dev_warn(sev->dev, > "SEV: TMR allocation failed, SEV-ES support unavailable\n"); > diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h > index 9f921d221b75..a3bb792bb842 100644 > --- a/include/linux/psp-sev.h > +++ b/include/linux/psp-sev.h > @@ -12,6 +12,8 @@ > #ifndef __PSP_SEV_H__ > #define __PSP_SEV_H__ > > +#include <linux/sev.h> > + > #include <uapi/linux/psp-sev.h> > > #ifdef CONFIG_X86 > @@ -940,6 +942,8 @@ int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error); > int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error); > > void *psp_copy_user_blob(u64 uaddr, u32 len); > +void *snp_alloc_firmware_page(gfp_t mask); > +void snp_free_firmware_page(void *addr); > > #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ > > @@ -981,6 +985,13 @@ static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *erro > return -ENODEV; > } > > +static inline void *snp_alloc_firmware_page(gfp_t mask) > +{ > + return NULL; > +} > + > +static inline void snp_free_firmware_page(void *addr) { } > + > #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ > > #endif /* __PSP_SEV_H__ */ > -- > 2.25.1 >
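(Aside: a corrected sketch of the callback-based refactor floated above. The unroll loop below only walks back over pages whose state change already succeeded, which avoids leaking more pages than necessary and avoids "undoing" the page that failed. The typedef and the 4K granularity follow the suggestion; everything else is an assumption, not what the series implements.)

typedef int (*rmp_state_change_func)(u64 pfn, int level);

static int snp_set_rmp_state(unsigned long paddr, unsigned int npages,
                             rmp_state_change_func state_change,
                             rmp_state_change_func undo)
{
        u64 pfn = __sme_clr(paddr) >> PAGE_SHIFT;
        unsigned int i;
        int ret;

        for (i = 0; i < npages; i++, pfn++) {
                ret = state_change(pfn, PG_LEVEL_4K);
                if (ret)
                        goto unroll;
        }

        return 0;

unroll:
        /* Walk back only over the pages that were already transitioned. */
        while (i--)
                undo(--pfn, PG_LEVEL_4K);

        return ret;
}

Under this scheme __snp_alloc_firmware_pages() would pass a per-page "make firmware" callback plus a per-page reclaim callback, and __snp_free_firmware_pages() would pass per-page reclaim plus per-page leak.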
[Public] Hello Peter, >> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, >> +bool locked) { >> + struct sev_data_snp_page_reclaim data; >> + int ret, err, i, n = 0; >> + >> + for (i = 0; i < npages; i++) { >What about setting |n| here too, also the other increments. >for (i = 0, n = 0; i < npages; i++, n++, pfn++) Yes that is simpler. >> + memset(&data, 0, sizeof(data)); >> + data.paddr = pfn << PAGE_SHIFT; >> + >> + if (locked) >> + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); >> + else >> + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, >> + &data, &err); > Can we change `sev_cmd_mutex` to some sort of nesting lock type? That could clean up this if (locked) code. > +static inline int rmp_make_firmware(unsigned long pfn, int level) { > + return rmp_make_private(pfn, 0, level, 0, true); } > + > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, > + bool need_reclaim) >This function can do a lot and when I read the call sites its hard to see what its doing since we have a combination of arguments which tell us what behavior is happening, some of which are not valid (ex: to_fw == true and need_reclaim == true is an >invalid argument combination). to_fw is used to make a firmware page and need_reclaim is for freeing the firmware page, so they are going to be mutually exclusive. I actually can connect with it quite logically with the callers : snp_alloc_firmware_pages will call with to_fw = true and need_reclaim = false and snp_free_firmware_pages will do the opposite, to_fw = false and need_reclaim = true. That seems straightforward to look at. >Also this for loop over |npages| is duplicated from snp_reclaim_pages(). One improvement here is that on the current >snp_reclaim_pages() if we fail to reclaim a page we assume we cannot reclaim the next pages, this may cause us to snp_leak_pages() more pages than we actually need too. Yes that is true. >What about something like this? >static snp_leak_page(u64 pfn, enum pg_level level) { > memory_failure(pfn, 0); > dump_rmpentry(pfn); >} >static int snp_reclaim_page(u64 pfn, enum pg_level level) { > int ret; > struct sev_data_snp_page_reclaim data; > ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > if (ret) > goto cleanup; > ret = rmp_make_shared(pfn, level); > if (ret) > goto cleanup; > return 0; >cleanup: > snp_leak_page(pfn, level) >} >typedef int (*rmp_state_change_func) (u64 pfn, enum pg_level level); >static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, rmp_state_change_func state_change, rmp_state_change_func cleanup) { > struct sev_data_snp_page_reclaim data; > int ret, err, i, n = 0; > for (i = 0, n = 0; i < npages; i++, n++, pfn++) { > ret = state_change(pfn, PG_LEVEL_4K) > if (ret) > goto cleanup; > } > return 0; > cleanup: > for (; i>= 0; i--, n--, pfn--) { > cleanup(pfn, PG_LEVEL_4K); > } > return ret; >} >Then inside of __snp_alloc_firmware_pages(): >snp_set_rmp_state(paddr, npages, rmp_make_firmware, snp_reclaim_page); >And inside of __snp_free_firmware_pages(): >snp_set_rmp_state(paddr, npages, snp_reclaim_page, snp_leak_page); >Just a suggestion feel free to ignore. The readability comment could be addressed much less invasively by just making separate functions for each valid combination of arguments here. Like snp_set_rmp_fw_state(), snp_set_rmp_shared_state(), >snp_set_rmp_release_state() or something. 
>> +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int >> +order, bool locked) { >> + unsigned long npages = 1ul << order, paddr; >> + struct sev_device *sev; >> + struct page *page; >> + >> + if (!psp_master || !psp_master->sev_data) >> + return NULL; >> + >> + page = alloc_pages(gfp_mask, order); >> + if (!page) >> + return NULL; >> + >> + /* If SEV-SNP is initialized then add the page in RMP table. */ >> + sev = psp_master->sev_data; >> + if (!sev->snp_inited) >> + return page; >> + >> + paddr = __pa((unsigned long)page_address(page)); >> + if (snp_set_rmp_state(paddr, npages, true, locked, false)) >> + return NULL; >So what about the case where snp_set_rmp_state() fails but we were able to reclaim all the pages? Should we be able to signal that to callers so that we could free |page| here? But given this is an error path already maybe we can optimize this in a >follow up series. Yes, we should actually tie in to snp_reclaim_pages() success or failure here in the case we were able to successfully unroll some or all of the firmware state change. > + > + return page; > +} > + > +void *snp_alloc_firmware_page(gfp_t gfp_mask) { > + struct page *page; > + > + page = __snp_alloc_firmware_pages(gfp_mask, 0, false); > + > + return page ? page_address(page) : NULL; } > +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page); > + > +static void __snp_free_firmware_pages(struct page *page, int order, > +bool locked) { > + unsigned long paddr, npages = 1ul << order; > + > + if (!page) > + return; > + > + paddr = __pa((unsigned long)page_address(page)); > + if (snp_set_rmp_state(paddr, npages, false, locked, true)) > + return; > Here we may be able to free some of |page| depending how where inside of snp_set_rmp_state() we failed. But again given this is an error path already maybe we can optimize this in a follow up series. Yes, we probably should be able to free some of the page(s) depending on how many page(s) got reclaimed in snp_set_rmp_state(). But these reclamation failures may not be very common, so any failure is indicative of a bigger issue, it might be the case when there is a single page reclamation error it might happen with all the subsequent pages and so follow a simple recovery procedure, then handling a more complex recovery for a chunk of pages being reclaimed and another chunk not. Thanks, Ashish
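(Aside: "tying in" the reclaim result could look roughly like the sketch below in __snp_alloc_firmware_pages(), assuming snp_set_rmp_state() is changed to return distinct error codes. The -EFAULT/-EIO convention here is invented purely for illustration.)

        paddr = __pa((unsigned long)page_address(page));
        rc = snp_set_rmp_state(paddr, npages, true, locked, false);
        if (rc) {
                /*
                 * Hypothetical convention: -EFAULT means the state change
                 * failed but every already-transitioned page was reclaimed,
                 * so the backing memory is ordinary host memory again and
                 * can be returned to the page allocator; -EIO means some
                 * pages could not be reclaimed and were leaked.
                 */
                if (rc == -EFAULT)
                        __free_pages(page, order);
                return NULL;
        }

        return page;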
On Tue, Jun 21, 2022 at 2:17 PM Kalra, Ashish <Ashish.Kalra@amd.com> wrote: > > [Public] > > Hello Peter, > > >> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, > >> +bool locked) { > >> + struct sev_data_snp_page_reclaim data; > >> + int ret, err, i, n = 0; > >> + > >> + for (i = 0; i < npages; i++) { > > >What about setting |n| here too, also the other increments. > > >for (i = 0, n = 0; i < npages; i++, n++, pfn++) > > Yes that is simpler. > > >> + memset(&data, 0, sizeof(data)); > >> + data.paddr = pfn << PAGE_SHIFT; > >> + > >> + if (locked) > >> + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > >> + else > >> + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, > >> + &data, &err); > > > Can we change `sev_cmd_mutex` to some sort of nesting lock type? That could clean up this if (locked) code. > > > +static inline int rmp_make_firmware(unsigned long pfn, int level) { > > + return rmp_make_private(pfn, 0, level, 0, true); } > > + > > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, > > + bool need_reclaim) > > >This function can do a lot and when I read the call sites its hard to see what its doing since we have a combination of arguments which tell us what behavior is happening, some of which are not valid (ex: to_fw == true and need_reclaim == true is an >invalid argument combination). > > to_fw is used to make a firmware page and need_reclaim is for freeing the firmware page, so they are going to be mutually exclusive. > > I actually can connect with it quite logically with the callers : > snp_alloc_firmware_pages will call with to_fw = true and need_reclaim = false > and snp_free_firmware_pages will do the opposite, to_fw = false and need_reclaim = true. > > That seems straightforward to look at. This might be a preference thing but I find it not straightforward. When I am reading through unmap_firmware_writeable() and I see /* Transition the pre-allocated buffer to the firmware state. */ if (snp_set_rmp_state(__pa(map->host), npages, true, true, false)) return -EFAULT; I don't actually know what snp_set_rmp_state() is doing unless I go look at the definition and see what all those booleans mean. This is unlike the rmp_make_shared() and rmp_make_private() functions, each of which tells me a lot more about what the function will do just from the name. > > >Also this for loop over |npages| is duplicated from snp_reclaim_pages(). One improvement here is that on the current > >snp_reclaim_pages() if we fail to reclaim a page we assume we cannot reclaim the next pages, this may cause us to snp_leak_pages() more pages than we actually need too. > > Yes that is true. > > >What about something like this? 
> > >static snp_leak_page(u64 pfn, enum pg_level level) { > > memory_failure(pfn, 0); > > dump_rmpentry(pfn); > >} > > >static int snp_reclaim_page(u64 pfn, enum pg_level level) { > > int ret; > > struct sev_data_snp_page_reclaim data; > > > ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > > if (ret) > > goto cleanup; > > > ret = rmp_make_shared(pfn, level); > > if (ret) > > goto cleanup; > > > return 0; > > >cleanup: > > snp_leak_page(pfn, level) > >} > > >typedef int (*rmp_state_change_func) (u64 pfn, enum pg_level level); > > >static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, rmp_state_change_func state_change, rmp_state_change_func cleanup) { > > struct sev_data_snp_page_reclaim data; > > int ret, err, i, n = 0; > > > for (i = 0, n = 0; i < npages; i++, n++, pfn++) { > > ret = state_change(pfn, PG_LEVEL_4K) > > if (ret) > > goto cleanup; > > } > > > return 0; > > > cleanup: > > for (; i>= 0; i--, n--, pfn--) { > > cleanup(pfn, PG_LEVEL_4K); > > } > > > return ret; > >} > > >Then inside of __snp_alloc_firmware_pages(): > > >snp_set_rmp_state(paddr, npages, rmp_make_firmware, snp_reclaim_page); > > >And inside of __snp_free_firmware_pages(): > > >snp_set_rmp_state(paddr, npages, snp_reclaim_page, snp_leak_page); > > >Just a suggestion feel free to ignore. The readability comment could be addressed much less invasively by just making separate functions for each valid combination of arguments here. Like snp_set_rmp_fw_state(), snp_set_rmp_shared_state(), > >snp_set_rmp_release_state() or something. > > >> +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int > >> +order, bool locked) { > >> + unsigned long npages = 1ul << order, paddr; > >> + struct sev_device *sev; > >> + struct page *page; > >> + > >> + if (!psp_master || !psp_master->sev_data) > >> + return NULL; > >> + > >> + page = alloc_pages(gfp_mask, order); > >> + if (!page) > >> + return NULL; > >> + > >> + /* If SEV-SNP is initialized then add the page in RMP table. */ > >> + sev = psp_master->sev_data; > >> + if (!sev->snp_inited) > >> + return page; > >> + > >> + paddr = __pa((unsigned long)page_address(page)); > >> + if (snp_set_rmp_state(paddr, npages, true, locked, false)) > >> + return NULL; > > >So what about the case where snp_set_rmp_state() fails but we were able to reclaim all the pages? Should we be able to signal that to callers so that we could free |page| here? But given this is an error path already maybe we can optimize this in a >follow up series. > > Yes, we should actually tie in to snp_reclaim_pages() success or failure here in the case we were able to successfully unroll some or all of the firmware state change. > > > + > > + return page; > > +} > > + > > +void *snp_alloc_firmware_page(gfp_t gfp_mask) { > > + struct page *page; > > + > > + page = __snp_alloc_firmware_pages(gfp_mask, 0, false); > > + > > + return page ? page_address(page) : NULL; } > > +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page); > > + > > +static void __snp_free_firmware_pages(struct page *page, int order, > > +bool locked) { > > + unsigned long paddr, npages = 1ul << order; > > + > > + if (!page) > > + return; > > + > > + paddr = __pa((unsigned long)page_address(page)); > > + if (snp_set_rmp_state(paddr, npages, false, locked, true)) > > + return; > > > Here we may be able to free some of |page| depending how where inside of snp_set_rmp_state() we failed. But again given this is an error path already maybe we can optimize this in a follow up series. 
> > Yes, we probably should be able to free some of the page(s) depending on how many page(s) got reclaimed in snp_set_rmp_state(). > But these reclamation failures may not be very common, so any failure is indicative of a bigger issue, it might be the case when there is a single page reclamation error it might happen with all the subsequent > pages and so follow a simple recovery procedure, then handling a more complex recovery for a chunk of pages being reclaimed and another chunk not. > > Thanks, > Ashish > > >
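(Aside: the "separate function per valid combination" option is a thin layer over the existing code; a sketch, with the wrapper names taken from the earlier suggestion:)

/* Transition a pre-allocated buffer to the firmware-owned state. */
static int snp_set_rmp_fw_state(unsigned long paddr, unsigned int npages, bool locked)
{
        return snp_set_rmp_state(paddr, npages, true, locked, false);
}

/* Reclaim firmware pages, transitioning them back to the shared state. */
static int snp_set_rmp_release_state(unsigned long paddr, unsigned int npages, bool locked)
{
        return snp_set_rmp_state(paddr, npages, false, locked, true);
}

With that, the call site Peter quotes reads as "if (snp_set_rmp_fw_state(__pa(map->host), npages, true))", which can be understood without consulting the definition.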
On Mon, Jun 20, 2022 at 11:05:01PM +0000, Ashish Kalra wrote: > From: Brijesh Singh <brijesh.singh@amd.com> > > The behavior and requirement for the SEV-legacy command is altered when > the SNP firmware is in the INIT state. See SEV-SNP firmware specification > for more details. > > Allocate the Trusted Memory Region (TMR) as a 2mb sized/aligned region > when SNP is enabled to satify new requirements for the SNP. Continue > allocating a 1mb region for !SNP configuration. > > While at it, provide API that can be used by others to allocate a page > that can be used by the firmware. The immediate user for this API will > be the KVM driver. The KVM driver to need to allocate a firmware context > page during the guest creation. The context page need to be updated > by the firmware. See the SEV-SNP specification for further details. > > Signed-off-by: Brijesh Singh <brijesh.singh@amd.com> > --- > drivers/crypto/ccp/sev-dev.c | 173 +++++++++++++++++++++++++++++++++-- > include/linux/psp-sev.h | 11 +++ > 2 files changed, 178 insertions(+), 6 deletions(-) > > diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c > index 35d76333e120..0dbd99f29b25 100644 > --- a/drivers/crypto/ccp/sev-dev.c > +++ b/drivers/crypto/ccp/sev-dev.c > @@ -79,6 +79,14 @@ static void *sev_es_tmr; > #define NV_LENGTH (32 * 1024) > static void *sev_init_ex_buffer; > > +/* When SEV-SNP is enabled the TMR needs to be 2MB aligned and 2MB size. */ > +#define SEV_SNP_ES_TMR_SIZE (2 * 1024 * 1024) > + > +static size_t sev_es_tmr_size = SEV_ES_TMR_SIZE; > + > +static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret); > +static int sev_do_cmd(int cmd, void *data, int *psp_ret); > + > static inline bool sev_version_greater_or_equal(u8 maj, u8 min) > { > struct sev_device *sev = psp_master->sev_data; > @@ -177,11 +185,161 @@ static int sev_cmd_buffer_len(int cmd) > return 0; > } > > +static void snp_leak_pages(unsigned long pfn, unsigned int npages) > +{ > + WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages); > + while (npages--) { > + memory_failure(pfn, 0); > + dump_rmpentry(pfn); > + pfn++; > + } > +} > + > +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked) > +{ > + struct sev_data_snp_page_reclaim data; > + int ret, err, i, n = 0; > + > + for (i = 0; i < npages; i++) { > + memset(&data, 0, sizeof(data)); > + data.paddr = pfn << PAGE_SHIFT; > + > + if (locked) > + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > + else > + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > + if (ret) > + goto cleanup; > + > + ret = rmp_make_shared(pfn, PG_LEVEL_4K); > + if (ret) > + goto cleanup; > + > + pfn++; > + n++; > + } > + > + return 0; > + > +cleanup: > + /* > + * If failed to reclaim the page then page is no longer safe to > + * be released, leak it. > + */ > + snp_leak_pages(pfn, npages - n); > + return ret; > +} > + > +static inline int rmp_make_firmware(unsigned long pfn, int level) > +{ > + return rmp_make_private(pfn, 0, level, 0, true); > +} > + > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, > + bool need_reclaim) > +{ > + unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */ > + int rc, n = 0, i; > + > + for (i = 0; i < npages; i++) { > + if (to_fw) > + rc = rmp_make_firmware(pfn, PG_LEVEL_4K); > + else > + rc = need_reclaim ? 
snp_reclaim_pages(pfn, 1, locked) : > + rmp_make_shared(pfn, PG_LEVEL_4K); > + if (rc) > + goto cleanup; > + > + pfn++; > + n++; > + } > + > + return 0; > + > +cleanup: > + /* Try unrolling the firmware state changes */ > + if (to_fw) { > + /* > + * Reclaim the pages which were already changed to the > + * firmware state. > + */ > + snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked); > + > + return rc; > + } > + > + /* > + * If failed to change the page state to shared, then its not safe > + * to release the page back to the system, leak it. > + */ > + snp_leak_pages(pfn, npages - n); > + > + return rc; > +} > + > +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order, bool locked) > +{ > + unsigned long npages = 1ul << order, paddr; > + struct sev_device *sev; > + struct page *page; > + > + if (!psp_master || !psp_master->sev_data) > + return NULL; > + > + page = alloc_pages(gfp_mask, order); > + if (!page) > + return NULL; > + > + /* If SEV-SNP is initialized then add the page in RMP table. */ > + sev = psp_master->sev_data; > + if (!sev->snp_inited) > + return page; > + > + paddr = __pa((unsigned long)page_address(page)); > + if (snp_set_rmp_state(paddr, npages, true, locked, false)) > + return NULL; > + > + return page; > +} > + > +void *snp_alloc_firmware_page(gfp_t gfp_mask) > +{ > + struct page *page; > + > + page = __snp_alloc_firmware_pages(gfp_mask, 0, false); Could be just struct page *page == __snp_alloc_firmware_pages(gfp_mask, 0, false); > + > + return page ? page_address(page) : NULL; > +} > +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page); Undocumented API Why don't you just export __snp_alloc_firmware_pages() and declare these trivial wrappers as "static inline" inside psp-sev.h? > + > +static void __snp_free_firmware_pages(struct page *page, int order, bool locked) > +{ > + unsigned long paddr, npages = 1ul << order; > + > + if (!page) > + return; Silently ignored NULL pointer. > + > + paddr = __pa((unsigned long)page_address(page)); > + if (snp_set_rmp_state(paddr, npages, false, locked, true)) > + return; > + > + __free_pages(page, order); > +} > + > +void snp_free_firmware_page(void *addr) > +{ > + if (!addr) > + return; Why silently ignore a NULL pointer? At minimum, pr_warn() would be appropriate. > + > + __snp_free_firmware_pages(virt_to_page(addr), 0, false); > +} > +EXPORT_SYMBOL(snp_free_firmware_page); Ditto, same comments as for allocation part. 
> + > static void *sev_fw_alloc(unsigned long len) > { > struct page *page; > > - page = alloc_pages(GFP_KERNEL, get_order(len)); > + page = __snp_alloc_firmware_pages(GFP_KERNEL, get_order(len), false); > if (!page) > return NULL; > > @@ -393,7 +551,7 @@ static int __sev_init_locked(int *error) > data.tmr_address = __pa(sev_es_tmr); > > data.flags |= SEV_INIT_FLAGS_SEV_ES; > - data.tmr_len = SEV_ES_TMR_SIZE; > + data.tmr_len = sev_es_tmr_size; > } > > return __sev_do_cmd_locked(SEV_CMD_INIT, &data, error); > @@ -421,7 +579,7 @@ static int __sev_init_ex_locked(int *error) > data.tmr_address = __pa(sev_es_tmr); > > data.flags |= SEV_INIT_FLAGS_SEV_ES; > - data.tmr_len = SEV_ES_TMR_SIZE; > + data.tmr_len = sev_es_tmr_size; > } > > return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error); > @@ -818,6 +976,8 @@ static int __sev_snp_init_locked(int *error) > sev->snp_inited = true; > dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); > > + sev_es_tmr_size = SEV_SNP_ES_TMR_SIZE; > + > return rc; > } > > @@ -1341,8 +1501,9 @@ static void sev_firmware_shutdown(struct sev_device *sev) > /* The TMR area was encrypted, flush it from the cache */ > wbinvd_on_all_cpus(); > > - free_pages((unsigned long)sev_es_tmr, > - get_order(SEV_ES_TMR_SIZE)); > + __snp_free_firmware_pages(virt_to_page(sev_es_tmr), > + get_order(sev_es_tmr_size), > + false); > sev_es_tmr = NULL; > } > > @@ -1430,7 +1591,7 @@ void sev_pci_init(void) > } > > /* Obtain the TMR memory area for SEV-ES use */ > - sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE); > + sev_es_tmr = sev_fw_alloc(sev_es_tmr_size); > if (!sev_es_tmr) > dev_warn(sev->dev, > "SEV: TMR allocation failed, SEV-ES support unavailable\n"); > diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h > index 9f921d221b75..a3bb792bb842 100644 > --- a/include/linux/psp-sev.h > +++ b/include/linux/psp-sev.h > @@ -12,6 +12,8 @@ > #ifndef __PSP_SEV_H__ > #define __PSP_SEV_H__ > > +#include <linux/sev.h> > + > #include <uapi/linux/psp-sev.h> > > #ifdef CONFIG_X86 > @@ -940,6 +942,8 @@ int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error); > int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error); > > void *psp_copy_user_blob(u64 uaddr, u32 len); > +void *snp_alloc_firmware_page(gfp_t mask); > +void snp_free_firmware_page(void *addr); > > #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ > > @@ -981,6 +985,13 @@ static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *erro > return -ENODEV; > } > > +static inline void *snp_alloc_firmware_page(gfp_t mask) > +{ > + return NULL; > +} > + > +static inline void snp_free_firmware_page(void *addr) { } > + > #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ > > #endif /* __PSP_SEV_H__ */ > -- > 2.25.1 > BR, Jarkko
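(Aside: a sketch of the static-inline wrapper arrangement Jarkko suggests for psp-sev.h, ignoring the extra includes the header would then need; the two __snp_*_firmware_pages() functions would have to be exported instead of the wrappers, and their bool locked argument would become part of the public interface:)

struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order, bool locked);
void __snp_free_firmware_pages(struct page *page, int order, bool locked);

static inline void *snp_alloc_firmware_page(gfp_t gfp_mask)
{
        struct page *page = __snp_alloc_firmware_pages(gfp_mask, 0, false);

        return page ? page_address(page) : NULL;
}

static inline void snp_free_firmware_page(void *addr)
{
        if (!addr)
                return;

        __snp_free_firmware_pages(virt_to_page(addr), 0, false);
}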
On Tue, Jun 21, 2022 at 08:17:15PM +0000, Kalra, Ashish wrote: > [Public] > > Hello Peter, > > >> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, > >> +bool locked) { > >> + struct sev_data_snp_page_reclaim data; > >> + int ret, err, i, n = 0; > >> + > >> + for (i = 0; i < npages; i++) { > > >What about setting |n| here too, also the other increments. > > >for (i = 0, n = 0; i < npages; i++, n++, pfn++) > > Yes that is simpler. > > >> + memset(&data, 0, sizeof(data)); > >> + data.paddr = pfn << PAGE_SHIFT; > >> + > >> + if (locked) > >> + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > >> + else > >> + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, > >> + &data, &err); > > > Can we change `sev_cmd_mutex` to some sort of nesting lock type? That could clean up this if (locked) code. > > > +static inline int rmp_make_firmware(unsigned long pfn, int level) { > > + return rmp_make_private(pfn, 0, level, 0, true); } > > + > > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, > > + bool need_reclaim) > > >This function can do a lot and when I read the call sites its hard to see what its doing since we have a combination of arguments which tell us what behavior is happening, some of which are not valid (ex: to_fw == true and need_reclaim == true is an >invalid argument combination). > > to_fw is used to make a firmware page and need_reclaim is for freeing the firmware page, so they are going to be mutually exclusive. > > I actually can connect with it quite logically with the callers : > snp_alloc_firmware_pages will call with to_fw = true and need_reclaim = false > and snp_free_firmware_pages will do the opposite, to_fw = false and need_reclaim = true. > > That seems straightforward to look at. > > >Also this for loop over |npages| is duplicated from snp_reclaim_pages(). One improvement here is that on the current > >snp_reclaim_pages() if we fail to reclaim a page we assume we cannot reclaim the next pages, this may cause us to snp_leak_pages() more pages than we actually need too. > > Yes that is true. > > >What about something like this? > > >static snp_leak_page(u64 pfn, enum pg_level level) { > > memory_failure(pfn, 0); > > dump_rmpentry(pfn); > >} > > >static int snp_reclaim_page(u64 pfn, enum pg_level level) { > > int ret; > > struct sev_data_snp_page_reclaim data; > > > ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > > if (ret) > > goto cleanup; > > > ret = rmp_make_shared(pfn, level); > > if (ret) > > goto cleanup; > > > return 0; > > >cleanup: > > snp_leak_page(pfn, level) > >} > > >typedef int (*rmp_state_change_func) (u64 pfn, enum pg_level level); > > >static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, rmp_state_change_func state_change, rmp_state_change_func cleanup) { > > struct sev_data_snp_page_reclaim data; > > int ret, err, i, n = 0; > > > for (i = 0, n = 0; i < npages; i++, n++, pfn++) { > > ret = state_change(pfn, PG_LEVEL_4K) > > if (ret) > > goto cleanup; > > } > > > return 0; > > > cleanup: > > for (; i>= 0; i--, n--, pfn--) { > > cleanup(pfn, PG_LEVEL_4K); > > } > > > return ret; > >} > > >Then inside of __snp_alloc_firmware_pages(): > > >snp_set_rmp_state(paddr, npages, rmp_make_firmware, snp_reclaim_page); > > >And inside of __snp_free_firmware_pages(): > > >snp_set_rmp_state(paddr, npages, snp_reclaim_page, snp_leak_page); > > >Just a suggestion feel free to ignore. 
The readability comment could be addressed much less invasively by just making separate functions for each valid combination of arguments here. Like snp_set_rmp_fw_state(), snp_set_rmp_shared_state(), > >snp_set_rmp_release_state() or something. > > >> +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int > >> +order, bool locked) { > >> + unsigned long npages = 1ul << order, paddr; > >> + struct sev_device *sev; > >> + struct page *page; > >> + > >> + if (!psp_master || !psp_master->sev_data) > >> + return NULL; > >> + > >> + page = alloc_pages(gfp_mask, order); > >> + if (!page) > >> + return NULL; > >> + > >> + /* If SEV-SNP is initialized then add the page in RMP table. */ > >> + sev = psp_master->sev_data; > >> + if (!sev->snp_inited) > >> + return page; > >> + > >> + paddr = __pa((unsigned long)page_address(page)); > >> + if (snp_set_rmp_state(paddr, npages, true, locked, false)) > >> + return NULL; > > >So what about the case where snp_set_rmp_state() fails but we were able to reclaim all the pages? Should we be able to signal that to callers so that we could free |page| here? But given this is an error path already maybe we can optimize this in a >follow up series. > > Yes, we should actually tie in to snp_reclaim_pages() success or failure here in the case we were able to successfully unroll some or all of the firmware state change. > > > + > > + return page; > > +} > > + > > +void *snp_alloc_firmware_page(gfp_t gfp_mask) { > > + struct page *page; > > + > > + page = __snp_alloc_firmware_pages(gfp_mask, 0, false); > > + > > + return page ? page_address(page) : NULL; } > > +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page); > > + > > +static void __snp_free_firmware_pages(struct page *page, int order, > > +bool locked) { > > + unsigned long paddr, npages = 1ul << order; > > + > > + if (!page) > > + return; > > + > > + paddr = __pa((unsigned long)page_address(page)); > > + if (snp_set_rmp_state(paddr, npages, false, locked, true)) > > + return; > > > Here we may be able to free some of |page| depending how where inside of snp_set_rmp_state() we failed. But again given this is an error path already maybe we can optimize this in a follow up series. > > Yes, we probably should be able to free some of the page(s) depending on how many page(s) got reclaimed in snp_set_rmp_state(). > But these reclamation failures may not be very common, so any failure is indicative of a bigger issue, it might be the case when there is a single page reclamation error it might happen with all the subsequent > pages and so follow a simple recovery procedure, then handling a more complex recovery for a chunk of pages being reclaimed and another chunk not. Silent ignore is stil a bad idea. I.e. at minimum would make sense to print a warning to klog. > > Thanks, > Ashish BR, Jarkko
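(Aside: the minimum being asked for is a visible klog message whenever pages are dropped on the floor; in the free path that might look like this sketch, where the message text is an assumption:)

        paddr = __pa((unsigned long)page_address(page));
        if (snp_set_rmp_state(paddr, npages, false, locked, true)) {
                pr_warn("SEV-SNP: failed to reclaim %lu page(s) at paddr 0x%lx, leaking them\n",
                        npages, paddr);
                return;
        }

        __free_pages(page, order);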
On Mon, Jun 20, 2022 at 11:05:01PM +0000, Ashish Kalra wrote: > +static void snp_leak_pages(unsigned long pfn, unsigned int npages) That function name looks wrong. > +{ > + WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages); > + while (npages--) { > + memory_failure(pfn, 0); ^^^^^^^^^^^^^^^^^^^^^^ Why? * This function is called by the low level machine check code * of an architecture when it detects hardware memory corruption * of a page. It tries its best to recover, which includes * dropping pages, killing processes etc. I don't think you wanna do that. It looks like you want to prevent the page from being used again but not mark it as PG_hwpoison and whatnot. PG_reserved perhaps? > + dump_rmpentry(pfn); > + pfn++; > + } > +} > + > +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked) > +{ > + struct sev_data_snp_page_reclaim data; > + int ret, err, i, n = 0; > + > + for (i = 0; i < npages; i++) { > + memset(&data, 0, sizeof(data)); > + data.paddr = pfn << PAGE_SHIFT; Oh wow, this is just silly. A struct for a single u64. Just use a u64 paddr; directly. But we had this topic already... > + > + if (locked) Ew, that's never a good design - conditional locking. > + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > + else > + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); <---- newline here. > + if (ret) > + goto cleanup; > + > + ret = rmp_make_shared(pfn, PG_LEVEL_4K); > + if (ret) > + goto cleanup; > + > + pfn++; > + n++; > + } > + > + return 0; > + > +cleanup: > + /* > + * If failed to reclaim the page then page is no longer safe to > + * be released, leak it. > + */ > + snp_leak_pages(pfn, npages - n); So this looks real weird: we go and reclaim pages, we hit an error during reclaiming a page X somewhere in-between and then we go and mark the *remaining* pages as not to be used?! Why? Why not only that *one* page which failed and then we continue with the rest?! > + return ret; > +} > + > +static inline int rmp_make_firmware(unsigned long pfn, int level) > +{ > + return rmp_make_private(pfn, 0, level, 0, true); > +} That's a silly wrapper used only once. Just do at the callsite: /* Mark this page as belonging to firmware */ rc = rmp_make_private(pfn, 0, level, 0, true); > + > +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, > + bool need_reclaim) Tangential to the above, this is just nuts with those bool arguments. Just look at the callsites: do you understand what they do? snp_set_rmp_state(paddr, npages, true, locked, false); what does that do? You need to go up to the definition of the function, count the arguments and see what that "true" arg stands for. What you should do instead is, have separate helpers which do only one thing: rmp_mark_pages_firmware(); rmp_mark_pages_shared(); rmp_mark_pages_... and then have the *callers* issue snp_reclaim_pages() when needed. So you'd have rmp_mark_pages_firmware(); rmp_mark_pages_shared() and __snp_free_firmware_pages() would do rmp_mark_pages_shared(); snp_reclaim_pages(); and so on. And then if you need locking, the callers can decide which sev_do_cmd variant to issue. And then if you have common code fragments which you can unify into a bigger helper function, *then* you can do that. Instead of multiplexing it this way. Which makes it really hard to follow what the code does. > + unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */ No side comments pls. 
> + int rc, n = 0, i; > + > + for (i = 0; i < npages; i++) { > + if (to_fw) > + rc = rmp_make_firmware(pfn, PG_LEVEL_4K); > + else > + rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) : > + rmp_make_shared(pfn, PG_LEVEL_4K); > + if (rc) > + goto cleanup; > + > + pfn++; > + n++; > + } > + > + return 0; > + > +cleanup: > + /* Try unrolling the firmware state changes */ > + if (to_fw) { > + /* > + * Reclaim the pages which were already changed to the > + * firmware state. > + */ > + snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked); > + > + return rc; > + } > + > + /* > + * If failed to change the page state to shared, then its not safe > + * to release the page back to the system, leak it. > + */ > + snp_leak_pages(pfn, npages - n); > + > + return rc; > +} ... > +void snp_free_firmware_page(void *addr) > +{ > + if (!addr) > + return; > + > + __snp_free_firmware_pages(virt_to_page(addr), 0, false); > +} > +EXPORT_SYMBOL(snp_free_firmware_page); EXPORT_SYMBOL_GPL() ofc.
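(Aside: a sketch of the de-multiplexed helper Boris describes for the allocation side. The name follows his suggestion; the body is assembled from the code already in the patch and is not what the series implements.)

static int rmp_mark_pages_firmware(unsigned long paddr, unsigned int npages, bool locked)
{
        unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT;
        unsigned int i;
        int rc;

        for (i = 0; i < npages; i++, pfn++) {
                /* Mark this page as belonging to firmware. */
                rc = rmp_make_private(pfn, 0, PG_LEVEL_4K, 0, true);
                if (rc)
                        goto cleanup;
        }

        return 0;

cleanup:
        /* Reclaim the pages which were already changed to the firmware state. */
        snp_reclaim_pages(__sme_clr(paddr) >> PAGE_SHIFT, i, locked);

        return rc;
}

__snp_alloc_firmware_pages() would then call rmp_mark_pages_firmware() directly, and the free path would call snp_reclaim_pages(), as discussed in the follow-up below.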
Hello Boris, On 10/13/2022 10:15 AM, Borislav Petkov wrote: > On Mon, Jun 20, 2022 at 11:05:01PM +0000, Ashish Kalra wrote: >> +static void snp_leak_pages(unsigned long pfn, unsigned int npages) > > That function name looks wrong. > >> +{ >> + WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages); >> + while (npages--) { >> + memory_failure(pfn, 0); > ^^^^^^^^^^^^^^^^^^^^^^ > > Why? The page was in FW state and we couldn't transition it back to HV/Shared state, any access to this page can cause RMP #PF. > > * This function is called by the low level machine check code > * of an architecture when it detects hardware memory corruption > * of a page. It tries its best to recover, which includes > * dropping pages, killing processes etc. > > I don't think you wanna do that. > > It looks like you want to prevent the page from being used again but not > mark it as PG_hwpoison and whatnot. PG_reserved perhaps? * PG_reserved is set for special pages. The "struct page" of such a * page should in general not be touched (e.g. set dirty) except by its * owner. If it is "still" accessed/touched then it can cause RMP #PF. On the other hand, * PG_hwpoison... Accessing is * not safe since it may cause another machine check. Don't touch! That sounds exactly the state we want these page(s) to be in ? Another possibility is PG_error. > >> + dump_rmpentry(pfn); >> + pfn++; >> + } >> +} >> + >> +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked) >> +{ >> + struct sev_data_snp_page_reclaim data; >> + int ret, err, i, n = 0; >> + >> + for (i = 0; i < npages; i++) { >> + memset(&data, 0, sizeof(data)); >> + data.paddr = pfn << PAGE_SHIFT; > > Oh wow, this is just silly. A struct for a single u64. Just use a > > u64 paddr; Ok. > > directly. But we had this topic already... > >> + >> + if (locked) > > Ew, that's never a good design - conditional locking. There is a previous suggestion to change `sev_cmd_mutex` to some sort of nesting lock type to clean up this if (locked) code, though AFAIK, there is no support for nesting lock types. > >> + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); >> + else >> + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); > > <---- newline here. > >> + if (ret) >> + goto cleanup; >> + >> + ret = rmp_make_shared(pfn, PG_LEVEL_4K); >> + if (ret) >> + goto cleanup; >> + >> + pfn++; >> + n++; >> + } >> + >> + return 0; >> + >> +cleanup: >> + /* >> + * If failed to reclaim the page then page is no longer safe to >> + * be released, leak it. >> + */ >> + snp_leak_pages(pfn, npages - n); > > So this looks real weird: we go and reclaim pages, we hit an error > during reclaiming a page X somewhere in-between and then we go and mark > the *remaining* pages as not to be used?! > > Why? > > Why not only that *one* page which failed and then we continue with the > rest?! I agree and will change to this approach. > >> + return ret; >> +} >> + >> +static inline int rmp_make_firmware(unsigned long pfn, int level) >> +{ >> + return rmp_make_private(pfn, 0, level, 0, true); >> +} > > That's a silly wrapper used only once. Just do at the callsite: > > /* Mark this page as belonging to firmware */ > rc = rmp_make_private(pfn, 0, level, 0, true); > Ok. >> + >> +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, >> + bool need_reclaim) > > Tangential to the above, this is just nuts with those bool arguments. > Just look at the callsites: do you understand what they do? 
> > snp_set_rmp_state(paddr, npages, true, locked, false); > > what does that do? You need to go up to the definition of the function, > count the arguments and see what that "true" arg stands for. I totally agree, this is simply unreadable. And this has been mentioned previously too ... This function can do a lot and when I read the call sites its hard to see what its doing since we have a combination of arguments which tell us what behavior is happening ... > > What you should do instead is, have separate helpers which do only one > thing: > > rmp_mark_pages_firmware(); > rmp_mark_pages_shared(); > rmp_mark_pages_... > > and then have the *callers* issue snp_reclaim_pages() when needed. So you'd have > > rmp_mark_pages_firmware(); > rmp_mark_pages_shared() > > and __snp_free_firmware_pages() would do > > rmp_mark_pages_shared(); > snp_reclaim_pages(); > Actually, this only needs to call snp_reclaim_pages(). > and so on. > > And then if you need locking, the callers can decide which sev_do_cmd > variant to issue. > > And then if you have common code fragments which you can unify into a > bigger helper function, *then* you can do that. > > Instead of multiplexing it this way. Which makes it really hard to > follow what the code does. > Sure i will do this cleanup. > >> + unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */ > > No side comments pls. > >> + int rc, n = 0, i; >> + >> + for (i = 0; i < npages; i++) { >> + if (to_fw) >> + rc = rmp_make_firmware(pfn, PG_LEVEL_4K); >> + else >> + rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) : >> + rmp_make_shared(pfn, PG_LEVEL_4K); >> + if (rc) >> + goto cleanup; >> + >> + pfn++; >> + n++; >> + } >> + >> + return 0; >> + >> +cleanup: >> + /* Try unrolling the firmware state changes */ >> + if (to_fw) { >> + /* >> + * Reclaim the pages which were already changed to the >> + * firmware state. >> + */ >> + snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked); >> + >> + return rc; >> + } >> + >> + /* >> + * If failed to change the page state to shared, then its not safe >> + * to release the page back to the system, leak it. >> + */ >> + snp_leak_pages(pfn, npages - n); >> + >> + return rc; >> +} > > ... > >> +void snp_free_firmware_page(void *addr) >> +{ >> + if (!addr) >> + return; >> + >> + __snp_free_firmware_pages(virt_to_page(addr), 0, false); >> +} >> +EXPORT_SYMBOL(snp_free_firmware_page); > > EXPORT_SYMBOL_GPL() ofc. > Thanks, Ashish
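(Aside: the simplified free path described above — snp_reclaim_pages() already issues SNP_PAGE_RECLAIM and rmp_make_shared() for every page, so no separate "make shared" step is needed. A sketch:)

static void __snp_free_firmware_pages(struct page *page, int order, bool locked)
{
        unsigned long paddr, npages = 1UL << order;

        if (!page)
                return;

        paddr = __pa((unsigned long)page_address(page));

        /*
         * If reclaim fails, the pages were leaked and must not be handed
         * back to the page allocator.
         */
        if (snp_reclaim_pages(paddr >> PAGE_SHIFT, npages, locked))
                return;

        __free_pages(page, order);
}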
On Fri, Oct 14, 2022 at 03:00:09PM -0500, Kalra, Ashish wrote: > If it is "still" accessed/touched then it can cause RMP #PF. > On the other hand, > > * PG_hwpoison... Accessing is > * not safe since it may cause another machine check. Don't touch! > > That sounds exactly the state we want these page(s) to be in ? > > Another possibility is PG_error. Something like this: diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e66f7aa3191d..baffa9c0dc30 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -186,6 +186,7 @@ enum pageflags { * THP. */ PG_has_hwpoisoned = PG_error, + PG_offlimits = PG_hwpoison, #endif /* non-lru isolated movable page */ and SNP will have to depend on CONFIG_MEMORY_FAILURE. But I'd let mm folks correct me here on the details.
Just to add here, writing to any of these pages from the Host will trigger a RMP #PF which will cause the RMP page fault handler to send a SIGBUS to the current process, as this page is not owned by Host. So calling memory_failure() is proactively doing the same, marking the page as poisoned and probably also killing the current process. Thanks, Ashish On 10/25/2022 5:25 AM, Borislav Petkov wrote: > On Fri, Oct 14, 2022 at 03:00:09PM -0500, Kalra, Ashish wrote: >> If it is "still" accessed/touched then it can cause RMP #PF. >> On the other hand, >> >> * PG_hwpoison... Accessing is >> * not safe since it may cause another machine check. Don't touch! >> >> That sounds exactly the state we want these page(s) to be in ? >> >> Another possibility is PG_error. > > Something like this: > > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h > index e66f7aa3191d..baffa9c0dc30 100644 > --- a/include/linux/page-flags.h > +++ b/include/linux/page-flags.h > @@ -186,6 +186,7 @@ enum pageflags { > * THP. > */ > PG_has_hwpoisoned = PG_error, > + PG_offlimits = PG_hwpoison, > #endif > > /* non-lru isolated movable page */ > > and SNP will have to depend on CONFIG_MEMORY_FAILURE. > > But I'd let mm folks correct me here on the details. >
On Mon, Oct 31, 2022 at 03:10:16PM -0500, Kalra, Ashish wrote: > Just to add here, writing to any of these pages from the Host > will trigger a RMP #PF which will cause the RMP page fault handler > to send a SIGBUS to the current process, as this page is not owned > by Host. And kill the host process? So this is another "policy" which sounds iffy. If we kill the process, we should at least say why. Are we doing that currently? > So calling memory_failure() is proactively doing the same, marking the > page as poisoned and probably also killing the current process. But the page is not suffering a memory failure - it cannot be reclaimed for whatever reason. Btw, how can that reclaim failure ever happen? Any real scenarios? Anyway, memory failure just happens to fit what you wanna do but you can't just reuse that - that's hacky. What is the problem with writing your own function which does that? Also, btw, please do not top-post. Thx.
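(Aside: a "write your own function" variant that records and warns without reusing memory_failure() or any page flag could be as small as the sketch below; the leak counter is an assumption:)

static unsigned long snp_nr_leaked_pages;

static void snp_leak_pages(unsigned long pfn, unsigned int npages)
{
        WARN(1, "SEV-SNP: failed to reclaim %u page(s) at PFN 0x%lx, leaking\n",
             npages, pfn);

        while (npages--) {
                dump_rmpentry(pfn);
                snp_nr_leaked_pages++;
                pfn++;
        }

        /*
         * Nothing else to do: the callers never hand these pages back to
         * the page allocator, so simply never freeing them keeps them out
         * of circulation.
         */
}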
Hello Boris, On 10/31/2022 4:15 PM, Borislav Petkov wrote: > On Mon, Oct 31, 2022 at 03:10:16PM -0500, Kalra, Ashish wrote: >> Just to add here, writing to any of these pages from the Host >> will trigger a RMP #PF which will cause the RMP page fault handler >> to send a SIGBUS to the current process, as this page is not owned >> by Host. > > And kill the host process? > > So this is another "policy" which sounds iffy. If we kill the process, > we should at least say why. Are we doing that currently? Yes, pasted below is the latest host RMP #PF handler, with new and additional comments added and there is a relevant comment added here for this behavior: static int handle_user_rmp_page_fault(struct pt_regs *regs, unsigned long error_code,unsigned long address) { ... ... /* * If its a guest private page, then the fault cannot be resolved. * Send a SIGBUS to terminate the process. * * As documented in APM vol3 pseudo-code for RMPUPDATE, when the * 2M range is covered by a valid (Assigned=1) 2M entry, the middle * 511 4k entries also have Assigned=1. This means that if there is * an access to a page which happens to lie within an Assigned 2M * entry, the 4k RMP entry will also have Assigned=1. Therefore, the * kernel should see that the page is not a valid page and the fault * cannot be resolved. */ if (snp_lookup_rmpentry(pfn, &rmp_level)) { do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); return RMP_PF_RETRY; } ... ... I believe that we already had an off-list discussion on the same, copying David Kaplan's reply on the same below: So what I think you want to do is: 1. Compute the pfn for the 4kb page you're trying to access (as your code below does) 2. Read that RMP entry -- If it is assigned then kill the process 3. Otherwise, check the level from the host page table. If level=PG_LEVEL_4K then somebody else may have already smashed this page, so just retry the instruction 4. If level=PG_LEVEL_2M/1G, then the host needs to split their page. This is the current algorithm being followed by the host RMP #PF handler. > >> So calling memory_failure() is proactively doing the same, marking the >> page as poisoned and probably also killing the current process. > > But the page is not suffering a memory failure - it cannot be reclaimed > for whatever reason. Btw, how can that reclaim failure ever happen? Any > real scenarios? The scenarios here are either SNP FW failure (SNP_PAGE_RECLAIM command) in transitioning the page back to HV state and/or RMPUPDATE instruction failure to transition the page back to hypervisor/shared state. > > Anyway, memory failure just happens to fit what you wanna do but you > can't just reuse that - that's hacky. What is the problem with writing > your own function which does that? > Ok. Will look at adding our own recovery function for the same, but that will again mark the pages as poisoned, right ? Still waiting for some/more feedback from mm folks on the same. Thanks, Ashish > Also, btw, please do not top-post. > > Thx. >
On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: > if (snp_lookup_rmpentry(pfn, &rmp_level)) { > do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); > return RMP_PF_RETRY; Does this issue some halfway understandable error message why the process got killed? > Will look at adding our own recovery function for the same, but that will > again mark the pages as poisoned, right ? Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. Semantically, it'll be handled the same way, ofc. > Still waiting for some/more feedback from mm folks on the same. Just send the patch and they'll give it. Thx.
Hello Boris, On 11/2/2022 6:22 AM, Borislav Petkov wrote: > On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >> return RMP_PF_RETRY; > > Does this issue some halfway understandable error message why the > process got killed? > >> Will look at adding our own recovery function for the same, but that will >> again mark the pages as poisoned, right ? > > Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. > Semantically, it'll be handled the same way, ofc. Added a new PG_offlimits flag and a simple corresponding handler for it. But there is still added complexity of handling hugepages as part of reclamation failures (both HugeTLB and transparent hugepages) and that means calling more static functions in mm/memory_failure.c There is probably a more appropriate handler in mm/memory-failure.c: soft_offline_page() - this will mark the page as HWPoisoned and also has handling for hugepages. And we can avoid adding a new page flag too. soft_offline_page - Soft offline a page. Soft offline a page, by migration or invalidation, without killing anything. So, this looks like a good option to call soft_offline_page() instead of memory_failure() in case of failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD and/or RMPUPDATE instruction. Thanks, Ashish > >> Still waiting for some/more feedback from mm folks on the same. > > Just send the patch and they'll give it. > > Thx. >
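(Aside: if soft_offline_page() were used, the leak path might become something like the sketch below. This assumes the CCP driver can actually reach soft_offline_page() from its context — the driver can be modular, so an export might be needed — and it is only an illustration of the suggestion, not an endorsement.)

static void snp_leak_pages(unsigned long pfn, unsigned int npages)
{
        pr_warn("SEV-SNP: failed to reclaim %u page(s) at PFN 0x%lx, offlining\n",
                npages, pfn);

        while (npages--) {
                dump_rmpentry(pfn);
                /* Take the page out of circulation without killing anything. */
                soft_offline_page(pfn, 0);
                pfn++;
        }
}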
On Mon, Nov 14, 2022 at 05:36:29PM -0600, Kalra, Ashish wrote: > But there is still added complexity of handling hugepages as part of > reclamation failures (both HugeTLB and transparent hugepages) and that Why? You want to offline pfns of 4K pages. What hugepages? > means calling more static functions in mm/memory_failure.c > > There is probably a more appropriate handler in mm/memory-failure.c: > > soft_offline_page() - this will mark the page as HWPoisoned and also has > handling for hugepages. And we can avoid adding a new page flag too. So if some other code wants to dump the amount of all hwpoisoned pages, it'll dump those too. Don't you see what is wrong with this picture? And btw, reusing the hwpoison flag PG_offlimits = PG_hwpoison like previously suggested doesn't help here either. IOW, I really don't like this lumping of semantics together. ;-\
Cc'ing memory failure folks, the beinning of this subthread is here: https://lore.kernel.org/all/3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra@amd.com/ On 11/15/22 00:36, Kalra, Ashish wrote: > Hello Boris, > > On 11/2/2022 6:22 AM, Borislav Petkov wrote: >> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>> return RMP_PF_RETRY; >> >> Does this issue some halfway understandable error message why the >> process got killed? >> >>> Will look at adding our own recovery function for the same, but that will >>> again mark the pages as poisoned, right ? >> >> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. >> Semantically, it'll be handled the same way, ofc. > > Added a new PG_offlimits flag and a simple corresponding handler for it. One thing is, there's not enough page flags to be adding more (except aliases for existing) for cases that can avoid it, but as Boris says, if using alias to PG_hwpoison it depends what will become confused with the actual hwpoison. > But there is still added complexity of handling hugepages as part of > reclamation failures (both HugeTLB and transparent hugepages) and that > means calling more static functions in mm/memory_failure.c > > There is probably a more appropriate handler in mm/memory-failure.c: > > soft_offline_page() - this will mark the page as HWPoisoned and also has > handling for hugepages. And we can avoid adding a new page flag too. > > soft_offline_page - Soft offline a page. > Soft offline a page, by migration or invalidation, without killing anything. > > So, this looks like a good option to call > soft_offline_page() instead of memory_failure() in case of > failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD > and/or RMPUPDATE instruction. So it's a bit unclear to me what exact situation we are handling here. The original patch here seems to me to be just leaking back pages that are unsafe for further use. soft_offline_page() seems to fit that scenario of a graceful leak before something is irrepairably corrupt and we page fault on it. But then in the thread you discus PF handling and killing. So what is the case here? If we detect this need to call snp_leak_pages() does it mean: a) nobody that could page fault at them (the guest?) is running anymore, we are tearing it down, we just can't reuse the pages further on the host - seem like soft_offline_page() could work, but maybe we could just put the pages on some leaked lists without special page? The only thing that should matter is not to free the pages to the page allocator so they would be reused by something else. b) something can stil page fault at them (what?) - AFAIU can't be resolved without killing something, memory_failure() might limit the damage > Thanks, > Ashish > >> >>> Still waiting for some/more feedback from mm folks on the same. >> >> Just send the patch and they'll give it. >> >> Thx. >>
On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote: > but maybe we could just put the pages on some leaked lists without > special page? The only thing that should matter is not to free the > pages to the page allocator so they would be reused by something else. As said on IRC, I like this a *lot*. This perfectly represents what those leaked pages are: leaked, cannot be used and lost. Certainly not hwpoisoned. Yeah, that's much better. Thx.
And, as dhansen connected the dots, this should be the exact same protection scenario as UPM: https://lore.kernel.org/all/20221025151344.3784230-1-chao.p.peng@linux.intel.com so you should be able to mark them inaccessible the same way and you won't need any poisoning dance. And Michael has patches so you probably should talk to him... Thx.
Hello Vlastimil, On 11/15/2022 9:14 AM, Vlastimil Babka wrote: > Cc'ing memory failure folks, the beinning of this subthread is here: > > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C944b59f239c541a52ac808dac71c2089%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041220947600149%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=do9zzyMlAErkKx5rguqnL2GoG4lhsWHDI74zgwLWaZU%3D&reserved=0 > > On 11/15/22 00:36, Kalra, Ashish wrote: >> Hello Boris, >> >> On 11/2/2022 6:22 AM, Borislav Petkov wrote: >>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>>> return RMP_PF_RETRY; >>> >>> Does this issue some halfway understandable error message why the >>> process got killed? >>> >>>> Will look at adding our own recovery function for the same, but that will >>>> again mark the pages as poisoned, right ? >>> >>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. >>> Semantically, it'll be handled the same way, ofc. >> >> Added a new PG_offlimits flag and a simple corresponding handler for it. > > One thing is, there's not enough page flags to be adding more (except > aliases for existing) for cases that can avoid it, but as Boris says, if > using alias to PG_hwpoison it depends what will become confused with the > actual hwpoison. > >> But there is still added complexity of handling hugepages as part of >> reclamation failures (both HugeTLB and transparent hugepages) and that >> means calling more static functions in mm/memory_failure.c >> >> There is probably a more appropriate handler in mm/memory-failure.c: >> >> soft_offline_page() - this will mark the page as HWPoisoned and also has >> handling for hugepages. And we can avoid adding a new page flag too. >> >> soft_offline_page - Soft offline a page. >> Soft offline a page, by migration or invalidation, without killing anything. >> >> So, this looks like a good option to call >> soft_offline_page() instead of memory_failure() in case of >> failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD >> and/or RMPUPDATE instruction. > > So it's a bit unclear to me what exact situation we are handling here. The > original patch here seems to me to be just leaking back pages that are > unsafe for further use. soft_offline_page() seems to fit that scenario of a > graceful leak before something is irrepairably corrupt and we page fault on it. > But then in the thread you discus PF handling and killing. So what is the > case here? If we detect this need to call snp_leak_pages() does it mean: > > a) nobody that could page fault at them (the guest?) is running anymore, we > are tearing it down, we just can't reuse the pages further on the host The host can page fault on them, if anything on the host tries to write to these pages. Host reads will return garbage data. > - seem like soft_offline_page() could work, but maybe we could just put the > pages on some leaked lists without special page? The only thing that should > matter is not to free the pages to the page allocator so they would be > reused by something else. > > b) something can stil page fault at them (what?) 
- AFAIU can't be resolved > without killing something, memory_failure() might limit the damage As i mentioned above, host writes will cause RMP violation page fault. Thanks, Ashish >> >>> >>>> Still waiting for some/more feedback from mm folks on the same. >>> >>> Just send the patch and they'll give it. >>> >>> Thx. >>> >
On 11/15/2022 11:24 AM, Kalra, Ashish wrote: > Hello Vlastimil, > > On 11/15/2022 9:14 AM, Vlastimil Babka wrote: >> Cc'ing memory failure folks, the beinning of this subthread is here: >> >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C944b59f239c541a52ac808dac71c2089%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041220947600149%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=do9zzyMlAErkKx5rguqnL2GoG4lhsWHDI74zgwLWaZU%3D&reserved=0 >> >> >> On 11/15/22 00:36, Kalra, Ashish wrote: >>> Hello Boris, >>> >>> On 11/2/2022 6:22 AM, Borislav Petkov wrote: >>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>>>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>>>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>>>> return RMP_PF_RETRY; >>>> >>>> Does this issue some halfway understandable error message why the >>>> process got killed? >>>> >>>>> Will look at adding our own recovery function for the same, but >>>>> that will >>>>> again mark the pages as poisoned, right ? >>>> >>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree >>>> upon. >>>> Semantically, it'll be handled the same way, ofc. >>> >>> Added a new PG_offlimits flag and a simple corresponding handler for it. >> >> One thing is, there's not enough page flags to be adding more (except >> aliases for existing) for cases that can avoid it, but as Boris says, if >> using alias to PG_hwpoison it depends what will become confused with the >> actual hwpoison. >> >>> But there is still added complexity of handling hugepages as part of >>> reclamation failures (both HugeTLB and transparent hugepages) and that >>> means calling more static functions in mm/memory_failure.c >>> >>> There is probably a more appropriate handler in mm/memory-failure.c: >>> >>> soft_offline_page() - this will mark the page as HWPoisoned and also has >>> handling for hugepages. And we can avoid adding a new page flag too. >>> >>> soft_offline_page - Soft offline a page. >>> Soft offline a page, by migration or invalidation, without killing >>> anything. >>> >>> So, this looks like a good option to call >>> soft_offline_page() instead of memory_failure() in case of >>> failure to transition the page back to HV/shared state via >>> SNP_RECLAIM_CMD >>> and/or RMPUPDATE instruction. >> >> So it's a bit unclear to me what exact situation we are handling here. >> The >> original patch here seems to me to be just leaking back pages that are >> unsafe for further use. soft_offline_page() seems to fit that scenario >> of a >> graceful leak before something is irrepairably corrupt and we page >> fault on it. >> But then in the thread you discus PF handling and killing. So what is the >> case here? If we detect this need to call snp_leak_pages() does it mean: >> >> a) nobody that could page fault at them (the guest?) is running >> anymore, we >> are tearing it down, we just can't reuse the pages further on the host > > The host can page fault on them, if anything on the host tries to write > to these pages. Host reads will return garbage data. > >> - seem like soft_offline_page() could work, but maybe we could just >> put the >> pages on some leaked lists without special page? 
The only thing that >> should >> matter is not to free the pages to the page allocator so they would be >> reused by something else. >> >> b) something can stil page fault at them (what?) - AFAIU can't be >> resolved >> without killing something, memory_failure() might limit the damage > > As i mentioned above, host writes will cause RMP violation page fault. > And to add here, if its a guest private page, then the above fault cannot be resolved, so the faulting process is terminated. Thanks, Ashish > >>> >>>> >>>>> Still waiting for some/more feedback from mm folks on the same. >>>> >>>> Just send the patch and they'll give it. >>>> >>>> Thx. >>>> >>
Hello Boris, On 11/15/2022 10:27 AM, Borislav Petkov wrote: > And, > > as dhansen connected the dots, this should be the exact same protection > scenario as UPM: > > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F20221025151344.3784230-1-chao.p.peng%40linux.intel.com&data=05%7C01%7Cashish.kalra%40amd.com%7Cbfecf32a51eb499b526d08dac726491e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041264625164355%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=3RqOC3b9qn0%2B2IRsTZURBmhAVtOn7rARR6fOMOsrFpE%3D&reserved=0 > > so you should be able to mark them inaccessible the same way and you > won't need any poisoning dance. With UPM, the guest pages are all still freed back to the host after guest shutdown, so it's not clear how this would help with handling of leaked pages, for e.g, the host can still access these pages once the guest is shutdown and it will cause the RMP violation #PF at that point. Additionally, our use case is of host allocated firmware pages as part of the crypto driver (to be passed to SNP firmware api calls and then re-transitioned back to host state on return) so these are not guest private pages in the true sense and they need to be handled differently in case there is a failure in reclaiming them. Can you elaborate on what you have in mind ? Thanks, Ashish > > And Michael has patches so you probably should talk to him... > > Thx. >
On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote: > Cc'ing memory failure folks, the beinning of this subthread is here: > > https://lore.kernel.org/all/3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra@amd.com/ > > On 11/15/22 00:36, Kalra, Ashish wrote: > > Hello Boris, > > > > On 11/2/2022 6:22 AM, Borislav Petkov wrote: > >> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: > >>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { > >>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); > >>> return RMP_PF_RETRY; > >> > >> Does this issue some halfway understandable error message why the > >> process got killed? > >> > >>> Will look at adding our own recovery function for the same, but that will > >>> again mark the pages as poisoned, right ? > >> > >> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. > >> Semantically, it'll be handled the same way, ofc. > > > > Added a new PG_offlimits flag and a simple corresponding handler for it. > > One thing is, there's not enough page flags to be adding more (except > aliases for existing) for cases that can avoid it, but as Boris says, if > using alias to PG_hwpoison it depends what will become confused with the > actual hwpoison. I agree with this. Just defining PG_offlimits as an alias of PG_hwpoison could break current hwpoison workload. So if you finally decide to go forward in this direction, you may as well have some indicator to distinguish the new kind of leaked pages from hwpoisoned pages. I don't remember exact thread, but I've read someone writing about similar kind of suggestion of using memory_failure() to make pages inaccessible in non-memory error usecase. I feel that it could be possible to generalize memory_failure() as general-purpose page offlining (by renaming it with hard_offline_page() and making memory_failure() one of the user of it). Thanks, Naoya Horiguchi
On 11/15/22 19:15, Kalra, Ashish wrote: > > On 11/15/2022 11:24 AM, Kalra, Ashish wrote: >> Hello Vlastimil, >> >> On 11/15/2022 9:14 AM, Vlastimil Babka wrote: >>> Cc'ing memory failure folks, the beinning of this subthread is here: >>> >>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C944b59f239c541a52ac808dac71c2089%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041220947600149%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=do9zzyMlAErkKx5rguqnL2GoG4lhsWHDI74zgwLWaZU%3D&reserved=0 >>> >>> On 11/15/22 00:36, Kalra, Ashish wrote: >>>> Hello Boris, >>>> >>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote: >>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>>>>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>>>>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>>>>> return RMP_PF_RETRY; >>>>> >>>>> Does this issue some halfway understandable error message why the >>>>> process got killed? >>>>> >>>>>> Will look at adding our own recovery function for the same, but that will >>>>>> again mark the pages as poisoned, right ? >>>>> >>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. >>>>> Semantically, it'll be handled the same way, ofc. >>>> >>>> Added a new PG_offlimits flag and a simple corresponding handler for it. >>> >>> One thing is, there's not enough page flags to be adding more (except >>> aliases for existing) for cases that can avoid it, but as Boris says, if >>> using alias to PG_hwpoison it depends what will become confused with the >>> actual hwpoison. >>> >>>> But there is still added complexity of handling hugepages as part of >>>> reclamation failures (both HugeTLB and transparent hugepages) and that >>>> means calling more static functions in mm/memory_failure.c >>>> >>>> There is probably a more appropriate handler in mm/memory-failure.c: >>>> >>>> soft_offline_page() - this will mark the page as HWPoisoned and also has >>>> handling for hugepages. And we can avoid adding a new page flag too. >>>> >>>> soft_offline_page - Soft offline a page. >>>> Soft offline a page, by migration or invalidation, without killing >>>> anything. >>>> >>>> So, this looks like a good option to call >>>> soft_offline_page() instead of memory_failure() in case of >>>> failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD >>>> and/or RMPUPDATE instruction. >>> >>> So it's a bit unclear to me what exact situation we are handling here. The >>> original patch here seems to me to be just leaking back pages that are >>> unsafe for further use. soft_offline_page() seems to fit that scenario of a >>> graceful leak before something is irrepairably corrupt and we page fault >>> on it. >>> But then in the thread you discus PF handling and killing. So what is the >>> case here? If we detect this need to call snp_leak_pages() does it mean: >>> >>> a) nobody that could page fault at them (the guest?) is running anymore, we >>> are tearing it down, we just can't reuse the pages further on the host >> >> The host can page fault on them, if anything on the host tries to write to >> these pages. Host reads will return garbage data. >> >>> - seem like soft_offline_page() could work, but maybe we could just put the >>> pages on some leaked lists without special page? 
The only thing that should >>> matter is not to free the pages to the page allocator so they would be >>> reused by something else. >>> >>> b) something can stil page fault at them (what?) - AFAIU can't be resolved >>> without killing something, memory_failure() might limit the damage >> >> As i mentioned above, host writes will cause RMP violation page fault. >> > > And to add here, if its a guest private page, then the above fault cannot be > resolved, so the faulting process is terminated. BTW would this not be mostly resolved as part of rebasing to UPM? - host will not have these pages mapped in the first place (both kernel directmap and qemu userspace) - guest will have them mapped, but I assume that the conversion from private to shared (that might fail?) can only happen after guest's mappings are invalidated in the first place? > Thanks, > Ashish > >> >>>> >>>>> >>>>>> Still waiting for some/more feedback from mm folks on the same. >>>>> >>>>> Just send the patch and they'll give it. >>>>> >>>>> Thx. >>>>> >>>
On 11/16/2022 3:08 AM, Vlastimil Babka wrote: > On 11/15/22 19:15, Kalra, Ashish wrote: >> >> On 11/15/2022 11:24 AM, Kalra, Ashish wrote: >>> Hello Vlastimil, >>> >>> On 11/15/2022 9:14 AM, Vlastimil Babka wrote: >>>> Cc'ing memory failure folks, the beinning of this subthread is here: >>>> >>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C38f8b76238014c67049308dac7b213a5%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041865033588985%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=DDm7tZhUJLy%2BMzS1SXUnsBmBQnAI5dqR6tWZhCKRMEI%3D&reserved=0 >>>> >>>> On 11/15/22 00:36, Kalra, Ashish wrote: >>>>> Hello Boris, >>>>> >>>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote: >>>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>>>>>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>>>>>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>>>>>> return RMP_PF_RETRY; >>>>>> >>>>>> Does this issue some halfway understandable error message why the >>>>>> process got killed? >>>>>> >>>>>>> Will look at adding our own recovery function for the same, but that will >>>>>>> again mark the pages as poisoned, right ? >>>>>> >>>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. >>>>>> Semantically, it'll be handled the same way, ofc. >>>>> >>>>> Added a new PG_offlimits flag and a simple corresponding handler for it. >>>> >>>> One thing is, there's not enough page flags to be adding more (except >>>> aliases for existing) for cases that can avoid it, but as Boris says, if >>>> using alias to PG_hwpoison it depends what will become confused with the >>>> actual hwpoison. >>>> >>>>> But there is still added complexity of handling hugepages as part of >>>>> reclamation failures (both HugeTLB and transparent hugepages) and that >>>>> means calling more static functions in mm/memory_failure.c >>>>> >>>>> There is probably a more appropriate handler in mm/memory-failure.c: >>>>> >>>>> soft_offline_page() - this will mark the page as HWPoisoned and also has >>>>> handling for hugepages. And we can avoid adding a new page flag too. >>>>> >>>>> soft_offline_page - Soft offline a page. >>>>> Soft offline a page, by migration or invalidation, without killing >>>>> anything. >>>>> >>>>> So, this looks like a good option to call >>>>> soft_offline_page() instead of memory_failure() in case of >>>>> failure to transition the page back to HV/shared state via SNP_RECLAIM_CMD >>>>> and/or RMPUPDATE instruction. >>>> >>>> So it's a bit unclear to me what exact situation we are handling here. The >>>> original patch here seems to me to be just leaking back pages that are >>>> unsafe for further use. soft_offline_page() seems to fit that scenario of a >>>> graceful leak before something is irrepairably corrupt and we page fault >>>> on it. >>>> But then in the thread you discus PF handling and killing. So what is the >>>> case here? If we detect this need to call snp_leak_pages() does it mean: >>>> >>>> a) nobody that could page fault at them (the guest?) is running anymore, we >>>> are tearing it down, we just can't reuse the pages further on the host >>> >>> The host can page fault on them, if anything on the host tries to write to >>> these pages. Host reads will return garbage data. 
>>> >>>> - seem like soft_offline_page() could work, but maybe we could just put the >>>> pages on some leaked lists without special page? The only thing that should >>>> matter is not to free the pages to the page allocator so they would be >>>> reused by something else. >>>> >>>> b) something can stil page fault at them (what?) - AFAIU can't be resolved >>>> without killing something, memory_failure() might limit the damage >>> >>> As i mentioned above, host writes will cause RMP violation page fault. >>> >> >> And to add here, if its a guest private page, then the above fault cannot be >> resolved, so the faulting process is terminated. > > BTW would this not be mostly resolved as part of rebasing to UPM? > - host will not have these pages mapped in the first place (both kernel > directmap and qemu userspace) > - guest will have them mapped, but I assume that the conversion from private > to shared (that might fail?) can only happen after guest's mappings are > invalidated in the first place? > Yes, that will be true for guest private pages. But then there are host allocated pages for firmware use which will remain in firmware page state or reclaim state if they can't be transitioned back to HV/shared state once the firmware releases them back to the host and accessing them at this point can potentially cause RMP violation #PF. Again i don't think this is going to happen regularly or frequently so it will be a rare error case where the page reclamation, i.e., the transition back to HV/shared state fails and now these pages are no longer safe to be used. Referring back to your thoughts about putting these pages on some leaked pages list, do any such leaked pages list exist currently ? Thanks, Ashish >>> >>>>> >>>>>> >>>>>>> Still waiting for some/more feedback from mm folks on the same. >>>>>> >>>>>> Just send the patch and they'll give it. >>>>>> >>>>>> Thx. >>>>>> >>>> >
On 11/16/22 11:19, Kalra, Ashish wrote: > On 11/16/2022 3:08 AM, Vlastimil Babka wrote: >> On 11/15/22 19:15, Kalra, Ashish wrote: >>> >>> On 11/15/2022 11:24 AM, Kalra, Ashish wrote: >>>> Hello Vlastimil, >>>> >>>> On 11/15/2022 9:14 AM, Vlastimil Babka wrote: >>>>> Cc'ing memory failure folks, the beinning of this subthread is here: >>>>> >>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C38f8b76238014c67049308dac7b213a5%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041865033588985%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=DDm7tZhUJLy%2BMzS1SXUnsBmBQnAI5dqR6tWZhCKRMEI%3D&reserved=0 >>>>> >>>>> On 11/15/22 00:36, Kalra, Ashish wrote: >>>>>> Hello Boris, >>>>>> >>>>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote: >>>>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>>>>>>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>>>>>>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>>>>>>> return RMP_PF_RETRY; >>>>>>> >>>>>>> Does this issue some halfway understandable error message why the >>>>>>> process got killed? >>>>>>> >>>>>>>> Will look at adding our own recovery function for the same, but that >>>>>>>> will >>>>>>>> again mark the pages as poisoned, right ? >>>>>>> >>>>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. >>>>>>> Semantically, it'll be handled the same way, ofc. >>>>>> >>>>>> Added a new PG_offlimits flag and a simple corresponding handler for it. >>>>> >>>>> One thing is, there's not enough page flags to be adding more (except >>>>> aliases for existing) for cases that can avoid it, but as Boris says, if >>>>> using alias to PG_hwpoison it depends what will become confused with the >>>>> actual hwpoison. >>>>> >>>>>> But there is still added complexity of handling hugepages as part of >>>>>> reclamation failures (both HugeTLB and transparent hugepages) and that >>>>>> means calling more static functions in mm/memory_failure.c >>>>>> >>>>>> There is probably a more appropriate handler in mm/memory-failure.c: >>>>>> >>>>>> soft_offline_page() - this will mark the page as HWPoisoned and also has >>>>>> handling for hugepages. And we can avoid adding a new page flag too. >>>>>> >>>>>> soft_offline_page - Soft offline a page. >>>>>> Soft offline a page, by migration or invalidation, without killing >>>>>> anything. >>>>>> >>>>>> So, this looks like a good option to call >>>>>> soft_offline_page() instead of memory_failure() in case of >>>>>> failure to transition the page back to HV/shared state via >>>>>> SNP_RECLAIM_CMD >>>>>> and/or RMPUPDATE instruction. >>>>> >>>>> So it's a bit unclear to me what exact situation we are handling here. The >>>>> original patch here seems to me to be just leaking back pages that are >>>>> unsafe for further use. soft_offline_page() seems to fit that scenario >>>>> of a >>>>> graceful leak before something is irrepairably corrupt and we page fault >>>>> on it. >>>>> But then in the thread you discus PF handling and killing. So what is the >>>>> case here? If we detect this need to call snp_leak_pages() does it mean: >>>>> >>>>> a) nobody that could page fault at them (the guest?) 
is running >>>>> anymore, we >>>>> are tearing it down, we just can't reuse the pages further on the host >>>> >>>> The host can page fault on them, if anything on the host tries to write to >>>> these pages. Host reads will return garbage data. >>>> >>>>> - seem like soft_offline_page() could work, but maybe we could just put >>>>> the >>>>> pages on some leaked lists without special page? The only thing that >>>>> should >>>>> matter is not to free the pages to the page allocator so they would be >>>>> reused by something else. >>>>> >>>>> b) something can stil page fault at them (what?) - AFAIU can't be resolved >>>>> without killing something, memory_failure() might limit the damage >>>> >>>> As i mentioned above, host writes will cause RMP violation page fault. >>>> >>> >>> And to add here, if its a guest private page, then the above fault cannot be >>> resolved, so the faulting process is terminated. >> >> BTW would this not be mostly resolved as part of rebasing to UPM? >> - host will not have these pages mapped in the first place (both kernel >> directmap and qemu userspace) >> - guest will have them mapped, but I assume that the conversion from private >> to shared (that might fail?) can only happen after guest's mappings are >> invalidated in the first place? >> > > Yes, that will be true for guest private pages. But then there are host > allocated pages for firmware use which will remain in firmware page state or > reclaim state if they can't be transitioned back to HV/shared state once the > firmware releases them back to the host and accessing them at this point can > potentially cause RMP violation #PF. > > Again i don't think this is going to happen regularly or frequently so it > will be a rare error case where the page reclamation, i.e., the transition > back to HV/shared state fails and now these pages are no longer safe to be > used. > > Referring back to your thoughts about putting these pages on some leaked > pages list, do any such leaked pages list exist currently ? Not AFAIK, you could just create a list_head somewhere appropriate (some snp state structure?) and put the pages there, maybe with a counter exposed in debugs. The point would be mostly that if something goes so wrong it would be leaking substantial amounts of memory, we can at least recognize the cause (but I suppose the dmesg will be also full of messages) and e.g. find the pages in a crash dump. > Thanks, > Ashish > >>>> >>>>>> >>>>>>> >>>>>>>> Still waiting for some/more feedback from mm folks on the same. >>>>>>> >>>>>>> Just send the patch and they'll give it. >>>>>>> >>>>>>> Thx. >>>>>>> >>>>> >>
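A rough sketch of that leaked-list idea, assuming page->lru can be reused once the page is off the LRU and out of the page allocator; the names here (snp_leaked_pages_list, snp_nr_leaked_pages) are illustrative, not the eventual implementation:

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/spinlock.h>

static LIST_HEAD(snp_leaked_pages_list);
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
static unsigned long snp_nr_leaked_pages;	/* could be exposed via debugfs */

static void snp_leak_pages(unsigned long pfn, unsigned int npages)
{
	struct page *page = pfn_to_page(pfn);

	WARN(1, "leaking pfn 0x%lx, npages %u\n", pfn, npages);

	spin_lock(&snp_leaked_pages_list_lock);
	while (npages--) {
		/*
		 * The page is owned by the driver, not on the LRU and never
		 * returned to the page allocator, so reusing page->lru is
		 * assumed to be safe here.
		 */
		list_add_tail(&page->lru, &snp_leaked_pages_list);
		snp_nr_leaked_pages++;
		page++;
	}
	spin_unlock(&snp_leaked_pages_list_lock);
}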
On 11/15/2022 11:19 PM, HORIGUCHI NAOYA(堀口 直也) wrote: > On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote: >> Cc'ing memory failure folks, the beinning of this subthread is here: >> >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C7b2d39d6e2504a8f923608dac792224b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041727879125176%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=KBJLKhPQP23vmvY%2FNnbjZs8wTJs%2FrF%2BiU54Sdc4Ldx4%3D&reserved=0 >> >> On 11/15/22 00:36, Kalra, Ashish wrote: >>> Hello Boris, >>> >>> On 11/2/2022 6:22 AM, Borislav Petkov wrote: >>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>>>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>>>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>>>> return RMP_PF_RETRY; >>>> >>>> Does this issue some halfway understandable error message why the >>>> process got killed? >>>> >>>>> Will look at adding our own recovery function for the same, but that will >>>>> again mark the pages as poisoned, right ? >>>> >>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. >>>> Semantically, it'll be handled the same way, ofc. >>> >>> Added a new PG_offlimits flag and a simple corresponding handler for it. >> >> One thing is, there's not enough page flags to be adding more (except >> aliases for existing) for cases that can avoid it, but as Boris says, if >> using alias to PG_hwpoison it depends what will become confused with the >> actual hwpoison. > > I agree with this. Just defining PG_offlimits as an alias of PG_hwpoison > could break current hwpoison workload. So if you finally decide to go > forward in this direction, you may as well have some indicator to > distinguish the new kind of leaked pages from hwpoisoned pages. > > I don't remember exact thread, but I've read someone writing about similar > kind of suggestion of using memory_failure() to make pages inaccessible in > non-memory error usecase. I feel that it could be possible to generalize > memory_failure() as general-purpose page offlining (by renaming it with But, doesn't memory_failure() also mark the pages as PG_hwpoison, and then using it for these leaked pages will again cause confusion with actual hwpoison ? Thanks, Ashish > hard_offline_page() and making memory_failure() one of the user of it). > > Thanks, > Naoya Horiguchi >
On 11/16/2022 4:25 AM, Vlastimil Babka wrote: > On 11/16/22 11:19, Kalra, Ashish wrote: >> On 11/16/2022 3:08 AM, Vlastimil Babka wrote: >>> On 11/15/22 19:15, Kalra, Ashish wrote: >>>> >>>> On 11/15/2022 11:24 AM, Kalra, Ashish wrote: >>>>> Hello Vlastimil, >>>>> >>>>> On 11/15/2022 9:14 AM, Vlastimil Babka wrote: >>>>>> Cc'ing memory failure folks, the beinning of this subthread is here: >>>>>> >>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C174b7caaf99a473194cd08dac7bcebf3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041911481429347%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=CFkAXNQqangvCqhnwDyIUJUkfiUrX67OpKDTtLGj6PU%3D&reserved=0 >>>>>> >>>>>> On 11/15/22 00:36, Kalra, Ashish wrote: >>>>>>> Hello Boris, >>>>>>> >>>>>>> On 11/2/2022 6:22 AM, Borislav Petkov wrote: >>>>>>>> On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: >>>>>>>>> if (snp_lookup_rmpentry(pfn, &rmp_level)) { >>>>>>>>> do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); >>>>>>>>> return RMP_PF_RETRY; >>>>>>>> >>>>>>>> Does this issue some halfway understandable error message why the >>>>>>>> process got killed? >>>>>>>> >>>>>>>>> Will look at adding our own recovery function for the same, but that >>>>>>>>> will >>>>>>>>> again mark the pages as poisoned, right ? >>>>>>>> >>>>>>>> Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. >>>>>>>> Semantically, it'll be handled the same way, ofc. >>>>>>> >>>>>>> Added a new PG_offlimits flag and a simple corresponding handler for it. >>>>>> >>>>>> One thing is, there's not enough page flags to be adding more (except >>>>>> aliases for existing) for cases that can avoid it, but as Boris says, if >>>>>> using alias to PG_hwpoison it depends what will become confused with the >>>>>> actual hwpoison. >>>>>> >>>>>>> But there is still added complexity of handling hugepages as part of >>>>>>> reclamation failures (both HugeTLB and transparent hugepages) and that >>>>>>> means calling more static functions in mm/memory_failure.c >>>>>>> >>>>>>> There is probably a more appropriate handler in mm/memory-failure.c: >>>>>>> >>>>>>> soft_offline_page() - this will mark the page as HWPoisoned and also has >>>>>>> handling for hugepages. And we can avoid adding a new page flag too. >>>>>>> >>>>>>> soft_offline_page - Soft offline a page. >>>>>>> Soft offline a page, by migration or invalidation, without killing >>>>>>> anything. >>>>>>> >>>>>>> So, this looks like a good option to call >>>>>>> soft_offline_page() instead of memory_failure() in case of >>>>>>> failure to transition the page back to HV/shared state via >>>>>>> SNP_RECLAIM_CMD >>>>>>> and/or RMPUPDATE instruction. >>>>>> >>>>>> So it's a bit unclear to me what exact situation we are handling here. The >>>>>> original patch here seems to me to be just leaking back pages that are >>>>>> unsafe for further use. soft_offline_page() seems to fit that scenario >>>>>> of a >>>>>> graceful leak before something is irrepairably corrupt and we page fault >>>>>> on it. >>>>>> But then in the thread you discus PF handling and killing. So what is the >>>>>> case here? If we detect this need to call snp_leak_pages() does it mean: >>>>>> >>>>>> a) nobody that could page fault at them (the guest?) 
is running >>>>>> anymore, we >>>>>> are tearing it down, we just can't reuse the pages further on the host >>>>> >>>>> The host can page fault on them, if anything on the host tries to write to >>>>> these pages. Host reads will return garbage data. >>>>> >>>>>> - seem like soft_offline_page() could work, but maybe we could just put >>>>>> the >>>>>> pages on some leaked lists without special page? The only thing that >>>>>> should >>>>>> matter is not to free the pages to the page allocator so they would be >>>>>> reused by something else. >>>>>> >>>>>> b) something can stil page fault at them (what?) - AFAIU can't be resolved >>>>>> without killing something, memory_failure() might limit the damage >>>>> >>>>> As i mentioned above, host writes will cause RMP violation page fault. >>>>> >>>> >>>> And to add here, if its a guest private page, then the above fault cannot be >>>> resolved, so the faulting process is terminated. >>> >>> BTW would this not be mostly resolved as part of rebasing to UPM? >>> - host will not have these pages mapped in the first place (both kernel >>> directmap and qemu userspace) >>> - guest will have them mapped, but I assume that the conversion from private >>> to shared (that might fail?) can only happen after guest's mappings are >>> invalidated in the first place? >>> >> >> Yes, that will be true for guest private pages. But then there are host >> allocated pages for firmware use which will remain in firmware page state or >> reclaim state if they can't be transitioned back to HV/shared state once the >> firmware releases them back to the host and accessing them at this point can >> potentially cause RMP violation #PF. >> >> Again i don't think this is going to happen regularly or frequently so it >> will be a rare error case where the page reclamation, i.e., the transition >> back to HV/shared state fails and now these pages are no longer safe to be >> used. >> >> Referring back to your thoughts about putting these pages on some leaked >> pages list, do any such leaked pages list exist currently ? > > Not AFAIK, you could just create a list_head somewhere appropriate (some snp > state structure?) and put the pages there, maybe with a counter exposed in > debugs. The point would be mostly that if something goes so wrong it would > be leaking substantial amounts of memory, we can at least recognize the > cause (but I suppose the dmesg will be also full of messages) and e.g. find > the pages in a crash dump. > Ok, so i will work on implementing this leaked pages list and put it on a sev/snp associated structure. Also to add here, we will actually get a not-present #PF instead of the RMP violation #PF on writing to these leaked pages, as these pages would have been removed from the kernel direct map. Thanks, Ashish >> >>>>> >>>>>>> >>>>>>>> >>>>>>>>> Still waiting for some/more feedback from mm folks on the same. >>>>>>>> >>>>>>>> Just send the patch and they'll give it. >>>>>>>> >>>>>>>> Thx. >>>>>>>> >>>>>> >>> >
On 11/16/22 02:25, Vlastimil Babka wrote: >> Referring back to your thoughts about putting these pages on some leaked >> pages list, do any such leaked pages list exist currently ? > Not AFAIK, you could just create a list_head somewhere appropriate (some snp > state structure?) and put the pages there, maybe with a counter exposed in > debugs. The point would be mostly that if something goes so wrong it would > be leaking substantial amounts of memory, we can at least recognize the > cause (but I suppose the dmesg will be also full of messages) and e.g. find > the pages in a crash dump. It might also be worth looking through the places that check PageHWPoison() and making sure that none of them are poking into the page contents. It's also the kind of thing that adding some CONFIG_DEBUG_VM checks might help with. For instance, nobody should ever be kmap*()'ing a private page. The same might even go for pin_user_pages().
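The CONFIG_DEBUG_VM suggestion could look roughly like the sketch below; snp_page_is_private() is a made-up stand-in for an RMP lookup, and where such a check would be wired into kmap*() or pin_user_pages() is left open:

#include <linux/mm.h>
#include <linux/mmdebug.h>

/* Made-up stand-in: would be an RMP lookup telling whether the page is guest-private. */
static inline bool snp_page_is_private(unsigned long pfn) { (void)pfn; return false; }

static inline void snp_debug_check_host_mapping(struct page *page)
{
	/* Under CONFIG_DEBUG_VM, scream if the host is about to map a private page. */
	VM_WARN_ON_ONCE(snp_page_is_private(page_to_pfn(page)));
}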
On Wed, Nov 16, 2022 at 12:01:11PM -0600, Kalra, Ashish wrote: > Ok, so i will work on implementing this leaked pages list and put it on a > sev/snp associated structure. See __sgx_sanitize_pages() and the poison list there, for an example. > Also to add here, we will actually get a not-present #PF instead of the RMP > violation #PF on writing to these leaked pages, as these pages would have > been removed from the kernel direct map. So if you do the list and still have the kernel raise a RMP fault for those pages, traversing that list in the RMP handler to check whether the page is there on it, should be a lot faster operation than doing the #PF thing and removing them from the direct map. And sorry for misleading you about UPM - we were thinking wrong yesterday. Thx.
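Traversing such a list from the fault path, as suggested here, could be as simple as the sketch below, reusing the illustrative snp_leaked_pages_list declarations from the earlier sketch (again, not actual kernel code):

static bool snp_pfn_is_leaked(unsigned long pfn)
{
	struct page *page, *target = pfn_to_page(pfn);
	bool leaked = false;

	spin_lock(&snp_leaked_pages_list_lock);
	list_for_each_entry(page, &snp_leaked_pages_list, lru) {
		if (page == target) {
			leaked = true;
			break;
		}
	}
	spin_unlock(&snp_leaked_pages_list_lock);

	return leaked;
}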
On 11/16/2022 12:33 PM, Borislav Petkov wrote: > On Wed, Nov 16, 2022 at 12:01:11PM -0600, Kalra, Ashish wrote: >> Ok, so i will work on implementing this leaked pages list and put it on a >> sev/snp associated structure. > > See __sgx_sanitize_pages() and the poison list there, for an example. > >> Also to add here, we will actually get a not-present #PF instead of the RMP >> violation #PF on writing to these leaked pages, as these pages would have >> been removed from the kernel direct map. > > So if you do the list and still have the kernel raise a RMP fault for > those pages, traversing that list in the RMP handler to check whether > the page is there on it, should be a lot faster operation than doing the > #PF thing and removing them from the direct map. > Actually, these host allocated pages would have already been removed from the kernel direct map, when they were transitioned to the firmware state. So actually the not-present #PF fault will happen on any read/write access to these leaked pages instead of the RMP violation #PF (not-present #PF has higher priority than RMP violation #PF). If these pages cannot be reclaimed, they are unsafe to use and cannot be added back to the kernel direct map. Thanks, Ashish > And sorry for misleading you about UPM - we were thinking wrong > yesterday. > > Thx. >
On Wed, Nov 16, 2022 at 12:53:36PM -0600, Kalra, Ashish wrote: > Actually, these host allocated pages would have already been removed from > the kernel direct map, And, as I said above, it would be a lot easier to handle any potential faults resulting from the host touching them by having it raise a *RMP* fault instead of normal *PF* fault where the latter code is a crazy mess.
On 11/16/2022 1:09 PM, Borislav Petkov wrote: > On Wed, Nov 16, 2022 at 12:53:36PM -0600, Kalra, Ashish wrote: >> Actually, these host allocated pages would have already been removed from >> the kernel direct map, > > And, as I said above, it would be a lot easier to handle any potential > faults resulting from the host touching them by having it raise a *RMP* > fault instead of normal *PF* fault where the latter code is a crazy mess. Just to reiterate here, we won't be getting a *RMP* fault but will instead get a normal (not-present) #PF fault when the host touches these pages. Sorry for any confusion about the fault signaled, earlier i mentioned we will get a RMP violation #PF, but actually as these pages are also removed from the kernel direct-map, therefore, we will get the not-present #PF and not the RMP #PF (core will check and signal not-present #PF before it performs the RMP checks). Thanks, Ashish
On Wed, Nov 16, 2022 at 04:28:11AM -0600, Kalra, Ashish wrote: > On 11/15/2022 11:19 PM, HORIGUCHI NAOYA(堀口 直也) wrote: > > On Tue, Nov 15, 2022 at 04:14:42PM +0100, Vlastimil Babka wrote: > > > Cc'ing memory failure folks, the beinning of this subthread is here: > > > > > > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Fall%2F3a51840f6a80c87b39632dc728dbd9b5dd444cd7.1655761627.git.ashish.kalra%40amd.com%2F&data=05%7C01%7Cashish.kalra%40amd.com%7C7b2d39d6e2504a8f923608dac792224b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638041727879125176%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=KBJLKhPQP23vmvY%2FNnbjZs8wTJs%2FrF%2BiU54Sdc4Ldx4%3D&reserved=0 > > > > > > On 11/15/22 00:36, Kalra, Ashish wrote: > > > > Hello Boris, > > > > > > > > On 11/2/2022 6:22 AM, Borislav Petkov wrote: > > > > > On Mon, Oct 31, 2022 at 04:58:38PM -0500, Kalra, Ashish wrote: > > > > > > if (snp_lookup_rmpentry(pfn, &rmp_level)) { > > > > > > do_sigbus(regs, error_code, address, VM_FAULT_SIGBUS); > > > > > > return RMP_PF_RETRY; > > > > > > > > > > Does this issue some halfway understandable error message why the > > > > > process got killed? > > > > > > > > > > > Will look at adding our own recovery function for the same, but that will > > > > > > again mark the pages as poisoned, right ? > > > > > > > > > > Well, not poisoned but PG_offlimits or whatever the mm folks agree upon. > > > > > Semantically, it'll be handled the same way, ofc. > > > > > > > > Added a new PG_offlimits flag and a simple corresponding handler for it. > > > > > > One thing is, there's not enough page flags to be adding more (except > > > aliases for existing) for cases that can avoid it, but as Boris says, if > > > using alias to PG_hwpoison it depends what will become confused with the > > > actual hwpoison. > > > > I agree with this. Just defining PG_offlimits as an alias of PG_hwpoison > > could break current hwpoison workload. So if you finally decide to go > > forward in this direction, you may as well have some indicator to > > distinguish the new kind of leaked pages from hwpoisoned pages. > > > > I don't remember exact thread, but I've read someone writing about similar > > kind of suggestion of using memory_failure() to make pages inaccessible in > > non-memory error usecase. I feel that it could be possible to generalize > > memory_failure() as general-purpose page offlining (by renaming it with > > But, doesn't memory_failure() also mark the pages as PG_hwpoison, and then > using it for these leaked pages will again cause confusion with actual > hwpoison ? Yes, so we might need modification of memory_failure code for this approach like renaming PG_hwpoison to more generic one (although some possible names like PageOffline and PageIsolated are already used) and/or somehow showing "which kind of leaked pages" info. Thanks, Naoya Horiguchi
Hello Boris, >> >>> + if (ret) >>> + goto cleanup; >>> + >>> + ret = rmp_make_shared(pfn, PG_LEVEL_4K); >>> + if (ret) >>> + goto cleanup; >>> + >>> + pfn++; >>> + n++; >>> + } >>> + >>> + return 0; >>> + >>> +cleanup: >>> + /* >>> + * If failed to reclaim the page then page is no longer safe to >>> + * be released, leak it. >>> + */ >>> + snp_leak_pages(pfn, npages - n); >> >> So this looks real weird: we go and reclaim pages, we hit an error >> during reclaiming a page X somewhere in-between and then we go and mark >> the *remaining* pages as not to be used?! >> >> Why? >> >> Why not only that *one* page which failed and then we continue with the >> rest?! > I had a re-look at this while fixing the memory_failure() handling and realized that we can't do a *partial* recovery here if we fail to reclaim a single page, i.e., if we hit an error during reclaiming a single page we need to mark the remaining pages as not usable. This is because this page could be a part of larger buffer which would have been transitioned to firmware state and need to be restored back in *full* to HV/shared state, any access to a partially transitioned buffer will still cause failures, basically the callers won't be able to do any kind of a recovery/access on a partially restored/reclaimed buffer and now potentially fragmented buffer, which anyway means failure due to data loss on non reclaimed page(s). So we need to be able to reclaim all the pages or none. Also this failure won't be happening regularly/frequently, it is a *rare* error case and if there is a reclamation failure on a single page, there is a high probability that there will be reclamation failures on subsequent pages. Thanks, Ashish
On Thu, Nov 17, 2022 at 02:56:47PM -0600, Kalra, Ashish wrote:
> So we need to be able to reclaim all the pages or none.
/me goes and looks at SNP_PAGE_RECLAIM's retvals:
- INVALID_PLATFORM_STATE - platform is not in INIT state. That's
certainly not a reason to leak pages.
- INVALID_ADDRESS - PAGE_PADDR is not a valid system physical address.
That's botched command buffer but not a broken page so no reason to leak
them either.
- INVALID_PAGE_STATE - the page is none of these types: metadata,
firmware, pre-guest or pre-swap. So if you issue page reclaim on the
wrong range of pages, that again looks like a user error but no need to
leak pages.
- INVALID_PAGE_SIZE - a size mismatch. Still sounds to me like a user
error of sev-guest instead of anything wrong deeper in the FW or HW.
So in all those, if you end up supplying the wrong range of addresses,
you most certainly will end up leaking the wrong pages.
So it sounds to me like you wanna say: "Error reclaiming range, check
your driver" instead of punishing any innocent pages.
Now, if the retval from the fw were FIRMWARE_INTERNAL_ERROR or so, then
sure, by all means. But not for the above. All the error conditions
above sound like the kernel has supplied the wrong range/botched command
buffer to the firmware so there's no need to leak pages.
Thx.
Hello Boris, On 11/20/2022 3:34 PM, Borislav Petkov wrote: > On Thu, Nov 17, 2022 at 02:56:47PM -0600, Kalra, Ashish wrote: >> So we need to be able to reclaim all the pages or none. > > /me goes and looks at SNP_PAGE_RECLAIM's retvals: > > - INVALID_PLATFORM_STATE - platform is not in INIT state. That's > certainly not a reason to leak pages. This should not happen, as there are sev->snp_initialized checks before any firmware page allocation or snp page transitions. > > - INVALID_ADDRESS - PAGE_PADDR is not a valid system physical address. > That's botched command buffer but not a broken page so no reason to leak > them either. > > - INVALID_PAGE_STATE - the page is neither of those types: metadata, > firmware, pre-guest nor pre-swap. So if you issue page reclaim on the > wrong range of pages that looks again like a user error but no need to > leak pages. > > - INVALID_PAGE_SIZE - a size mismatch. Still sounds to me like a user > error of sev-guest instead of anything wrong deeper in the FW or HW. > > So in all those, if you end up supplying the wrong range of addresses, > you most certainly will end up leaking the wrong pages. > > So it sounds to me like you wanna say: "Error reclaiming range, check > your driver" instead of punishing any innocent pages. I agree, but these pages are not in the right state to be released back to the system or accessed by the host, because they have already been transitioned successfully to firmware state and the reclaim has failed. If we release them back to page-allocator and whenever the host accesses them, it will get a not-present #PF and it will panic/crash the host process. It might be a user/sev-guest error, but these pages are now unsafe to use. So is a kernel panic justified here, instead of not releasing the pages back to host and logging errors for the same. Thanks, Ashish > > Now, if the retval from the fw were FIRMWARE_INTERNAL_ERROR or so, then > sure, by all means. But not for the above. All the error conditions > above sound like the kernel has supplied the wrong range/botched command > buffer to the firmware so there's no need to leak pages. > > Thx. >
On Mon, Nov 21, 2022 at 06:37:18PM -0600, Kalra, Ashish wrote: > I agree, but these pages are not in the right state to be released back to Which pages exactly? Some pages' state has really changed underneath or you've given the wrong range? > It might be a user/sev-guest error, but these pages are now unsafe to use. > So is a kernel panic justified here, instead of not releasing the pages back > to host and logging errors for the same. Ok, there are two cases: * kernel error: I guess a big fat warning is the least we can issue here. Not sure about panic considering this should almost never happen and a warning would allow for people to catch dumps and debug the issue. * firmware error: I don't think you can know that that is really the case on a production system without additional fw debugging capabilities. Dumping a warning would be the least we can do here too, to signal that something's out of the ordinary and so people can look into it further. So yeah, a big fat warning is a good start. And then you don't need any memory poisoning etc gunk.
Hello Boris, On 11/22/2022 4:17 AM, Borislav Petkov wrote: > On Mon, Nov 21, 2022 at 06:37:18PM -0600, Kalra, Ashish wrote: >> I agree, but these pages are not in the right state to be released back to > > Which pages exactly? > > Some pages' state has really changed underneath or you've given the > wrong range? > >> It might be a user/sev-guest error, but these pages are now unsafe to use. >> So is a kernel panic justified here, instead of not releasing the pages back >> to host and logging errors for the same. > > Ok, there are two cases: > > * kernel error: I guess a big fat warning is the least we can issue > here. Not sure about panic considering this should almost never happen > and a warning would allow for people to catch dumps and debug the issue. > > * firmware error: I don't think you can know that that is really > the case on a production system without additional fw debugging > capabilities. Dumping a warning would be the least we can do here too, > to signal that something's out of the ordinary and so people can look > into it further. Please note that in both cases, these non-reclaimed pages cannot be freed/returned back to the page allocator. Anytime the kernel accesses these pages it will cause a panic or host process crash. So along with warning, the pages will be added to a leaked pages list, but there is no poisoning or anything, only need to ensure that these pages are not touched/accessed again. Thanks, Ashish > > So yeah, a big fat warning is a good start. And then you don't need any > memory poisoning etc gunk. >
On Tue, Nov 22, 2022 at 04:32:18AM -0600, Kalra, Ashish wrote: > Please note that in both cases, these non-reclaimed pages cannot be > freed/returned back to the page allocator. You keep repeating "these pages". Which pages? What if you specify the wrong, innocent pages because the kernel is wrong?
On 11/22/2022 4:44 AM, Borislav Petkov wrote: > On Tue, Nov 22, 2022 at 04:32:18AM -0600, Kalra, Ashish wrote: >> Please note that in both cases, these non-reclaimed pages cannot be >> freed/returned back to the page allocator. > > You keep repeating "these pages". Which pages? The pages which have been allocated for firmware use (such as for SNP_INIT command, TMR memory for SEV-ES usage, etc), command buffers used for SEV legacy commands when SNP is enabled. Here is a detailed description of the SEV legacy command handling when SNP is enabled: The behavior of the SEV-legacy commands is altered when the SNP firmware is in the INIT state. When SNP is in INIT state, all the SEV-legacy commands that cause the firmware to write to memory must be in the firmware state before issuing the command. A command buffer may contain a system physical address that the firmware may write to. In case the command buffer contains a system physical address points to a guest memory, we need to change the page state to the firmware in the RMP table before issuing the command and restore the state to shared after the command completes. Then there are host buffers allocated for SNP platform status command, SNP launch update and SNP launch update vmsa command. The other pages which can be user or guest provided are SNP guest requests and SNP guest debug helpers. It is important to note that if invalid address/len are supplied, the failure will happen at the initial stage itself of transitioning these pages to firmware state. But if the above pages have been successfully transitioned to firmware state and passed on to the SNP firmware, then after return, they need to be restored to shared state. If this restoration/reclamation fails, then accessing these pages will cause the kernel to panic. > > What if you specify the wrong, innocent pages because the kernel is > wrong? > In such a case the kernel panic is justifiable, but again if incorrect addresses are supplied, the failure will happen at the initial stage of transitioning these pages to firmware state and there is no need to reclaim. Or, otherwise dump a warning and let the pages not be freed/returned back to the page allocator. It is either innocent pages or kernel panic or an innocent host process crash (these are the choices to make). Thanks, Ashish
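The buffer handling described above (transition to firmware state, issue the legacy command, transition back to shared, and leak only when the transition back fails) roughly follows the pattern sketched below; snp_set_pages_firmware() and snp_set_pages_shared() are placeholder names for the real state-change helpers:

#include <linux/errno.h>

/*
 * Placeholder prototypes: sev_do_cmd() exists in the driver; the page
 * state-change helpers and snp_leak_pages() are made-up names here.
 */
int sev_do_cmd(int cmd, void *data, int *psp_ret);
int snp_set_pages_firmware(unsigned long pfn, unsigned int npages);
int snp_set_pages_shared(unsigned long pfn, unsigned int npages);
void snp_leak_pages(unsigned long pfn, unsigned int npages);

static int sev_cmd_with_fw_buffer(int cmd, void *data, unsigned long pfn,
				  unsigned int npages, int *psp_ret)
{
	int rc;

	/*
	 * If this first transition fails, nothing was handed to the
	 * firmware and there is nothing to reclaim or leak.
	 */
	rc = snp_set_pages_firmware(pfn, npages);
	if (rc)
		return rc;

	rc = sev_do_cmd(cmd, data, psp_ret);

	/*
	 * Restore the buffer to shared state. On failure the pages are no
	 * longer safe for the host, so park them on the leaked list and
	 * never hand them back to the page allocator.
	 */
	if (snp_set_pages_shared(pfn, npages)) {
		snp_leak_pages(pfn, npages);
		return -EIO;
	}

	return rc;
}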
On Tue, Nov 22, 2022 at 05:44:47AM -0600, Kalra, Ashish wrote:
> It is important to note that if invalid address/len are supplied, the
> failure will happen at the initial stage itself of transitioning these pages
> to firmware state.

/me goes and checks out your v6 tree based on 5.18.

Lemme choose one:

static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
	...

	inpages = sev_pin_memory(kvm, params.uaddr, params.len, &npages, 1);

	...

	for (i = 0; i < npages; i++) {
		pfn = page_to_pfn(inpages[i]);

	...

		ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, &data, error);
		if (ret) {
			/*
			 * If the command failed then need to reclaim the page.
			 */
			snp_page_reclaim(pfn);

and here it would leak the pages if it cannot reclaim them.

Now how did you get those?

Through params.uaddr and params.len which come from userspace:

	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
		return -EFAULT;

Now, think about it, can userspace be trusted?

Exactly.

Yeah, yeah, I see it does is_hva_registered() but userspace can just as
well supply the wrong region which fits.

> In such a case the kernel panic is justifiable,

So userspace can supply whatever it wants and you'd panic?

You surely don't mean that.

> but again if incorrect addresses are supplied, the failure will happen
> at the initial stage of transitioning these pages to firmware state
> and there is no need to reclaim.

See above.

> Or, otherwise dump a warning and let the pages not be freed/returned
> back to the page allocator.
>
> It is either innocent pages or kernel panic or an innocent host
> process crash (these are the choices to make).

No, it is make the kernel as resilient as possible. Which means, no
panic, add the pages to a not-to-be-used-anymore list and scream loudly
with warning messages when it must leak pages so that people can fix the
issue.

Ok?
Hello Boris,

On 11/23/2022 5:40 AM, Borislav Petkov wrote:
> On Tue, Nov 22, 2022 at 05:44:47AM -0600, Kalra, Ashish wrote:
>> It is important to note that if invalid address/len are supplied, the
>> failure will happen at the initial stage itself of transitioning these pages
>> to firmware state.
>
> /me goes and checks out your v6 tree based on 5.18.
>
> Lemme choose one:
>
> static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
> {
> 	...
>
> 	inpages = sev_pin_memory(kvm, params.uaddr, params.len, &npages, 1);
>
> 	...
>
> 	for (i = 0; i < npages; i++) {
> 		pfn = page_to_pfn(inpages[i]);
>
> 	...
>
> 		ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, &data, error);
> 		if (ret) {
> 			/*
> 			 * If the command failed then need to reclaim the page.
> 			 */
> 			snp_page_reclaim(pfn);
>
> and here it would leak the pages if it cannot reclaim them.
>
> Now how did you get those?
>
> Through params.uaddr and params.len which come from userspace:
>
> 	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
> 		return -EFAULT;
>
> Now, think about it, can userspace be trusted?
>
> Exactly.
>
> Yeah, yeah, I see it does is_hva_registered() but userspace can just as
> well supply the wrong region which fits.

Yes, that's right.

Also, before sev_issue_cmd() above, there is a call to rmp_make_private()
to transition these pages to firmware state before we issue the
LAUNCH_UPDATE command, as below:

	ret = rmp_make_private(pfn, gfn << PAGE_SHIFT, level, sev_get_asid(kvm), true);
	if (ret) {
		ret = -EFAULT;
		goto e_unpin;
	}
	...
	ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, &data, error);

So in case the userspace provided an invalid/incorrect range, this
transition would have failed and there would not have been a need to do
any reclaim, so no pages are leaked here.

This is also the reason why we need to reclaim pages if the subsequent
LAUNCH_UPDATE command fails: the pages are now in firmware state because
of the rmp_make_private() call and are unsafe to be used by the host.

>
>> In such a case the kernel panic is justifiable,
>
> So userspace can supply whatever it wants and you'd panic?
>
> You surely don't mean that.
>

No, we don't want to do that.

>> but again if incorrect addresses are supplied, the failure will happen
>> at the initial stage of transitioning these pages to firmware state
>> and there is no need to reclaim.

This is the case I mentioned above: rmp_make_private() before the
firmware command is the initial stage of transitioning the pages to
firmware state.

>
> See above.
>
>> Or, otherwise dump a warning and let the pages not be freed/returned
>> back to the page allocator.
>>
>> It is either innocent pages or kernel panic or an innocent host
>> process crash (these are the choices to make).
>
> No, it is make the kernel as resilient as possible. Which means, no
> panic, add the pages to a not-to-be-used-anymore list and scream loudly
> with warning messages when it must leak pages so that people can fix the
> issue.
>
> Ok?
>

Right, yes, I totally agree.

So now we are adding these pages to an internal not-to-be-used-anymore
list, printing warnings, and ensuring there are no panics, as we don't
allow these pages to be released back to the page allocator.

Thanks,
Ashish
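For completeness, here is a minimal sketch of the reclaim-or-leak behaviour being agreed on above, roughly the shape the KVM-side snp_page_reclaim() helper could take. The body is an assumption for illustration; only snp_guest_page_reclaim(), rmp_make_shared() and the idea of a leaked-pages list come from this series, and the leak helper would need to be made reachable from KVM:

/*
 * Illustrative sketch, not the actual v6 code: try to move a page back
 * from firmware state; if that fails, warn and never hand the page back
 * to the page allocator instead of panicking.
 */
static int snp_page_reclaim(u64 pfn)
{
	struct sev_data_snp_page_reclaim data = {};
	int err, rc;

	data.paddr = pfn << PAGE_SHIFT;
	rc = snp_guest_page_reclaim(&data, &err);
	if (rc || rmp_make_shared(pfn, PG_LEVEL_4K)) {
		/*
		 * The page is stuck in firmware state: park it on the
		 * not-to-be-used-anymore list and scream loudly.
		 */
		snp_leak_pages(pfn, 1);
		rc = rc ? rc : -EIO;
	}

	return rc;
}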
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 35d76333e120..0dbd99f29b25 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -79,6 +79,14 @@ static void *sev_es_tmr; #define NV_LENGTH (32 * 1024) static void *sev_init_ex_buffer; +/* When SEV-SNP is enabled the TMR needs to be 2MB aligned and 2MB size. */ +#define SEV_SNP_ES_TMR_SIZE (2 * 1024 * 1024) + +static size_t sev_es_tmr_size = SEV_ES_TMR_SIZE; + +static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret); +static int sev_do_cmd(int cmd, void *data, int *psp_ret); + static inline bool sev_version_greater_or_equal(u8 maj, u8 min) { struct sev_device *sev = psp_master->sev_data; @@ -177,11 +185,161 @@ static int sev_cmd_buffer_len(int cmd) return 0; } +static void snp_leak_pages(unsigned long pfn, unsigned int npages) +{ + WARN(1, "psc failed, pfn 0x%lx pages %d (leaking)\n", pfn, npages); + while (npages--) { + memory_failure(pfn, 0); + dump_rmpentry(pfn); + pfn++; + } +} + +static int snp_reclaim_pages(unsigned long pfn, unsigned int npages, bool locked) +{ + struct sev_data_snp_page_reclaim data; + int ret, err, i, n = 0; + + for (i = 0; i < npages; i++) { + memset(&data, 0, sizeof(data)); + data.paddr = pfn << PAGE_SHIFT; + + if (locked) + ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); + else + ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err); + if (ret) + goto cleanup; + + ret = rmp_make_shared(pfn, PG_LEVEL_4K); + if (ret) + goto cleanup; + + pfn++; + n++; + } + + return 0; + +cleanup: + /* + * If failed to reclaim the page then page is no longer safe to + * be released, leak it. + */ + snp_leak_pages(pfn, npages - n); + return ret; +} + +static inline int rmp_make_firmware(unsigned long pfn, int level) +{ + return rmp_make_private(pfn, 0, level, 0, true); +} + +static int snp_set_rmp_state(unsigned long paddr, unsigned int npages, bool to_fw, bool locked, + bool need_reclaim) +{ + unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT; /* Cbit maybe set in the paddr */ + int rc, n = 0, i; + + for (i = 0; i < npages; i++) { + if (to_fw) + rc = rmp_make_firmware(pfn, PG_LEVEL_4K); + else + rc = need_reclaim ? snp_reclaim_pages(pfn, 1, locked) : + rmp_make_shared(pfn, PG_LEVEL_4K); + if (rc) + goto cleanup; + + pfn++; + n++; + } + + return 0; + +cleanup: + /* Try unrolling the firmware state changes */ + if (to_fw) { + /* + * Reclaim the pages which were already changed to the + * firmware state. + */ + snp_reclaim_pages(paddr >> PAGE_SHIFT, n, locked); + + return rc; + } + + /* + * If failed to change the page state to shared, then its not safe + * to release the page back to the system, leak it. + */ + snp_leak_pages(pfn, npages - n); + + return rc; +} + +static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order, bool locked) +{ + unsigned long npages = 1ul << order, paddr; + struct sev_device *sev; + struct page *page; + + if (!psp_master || !psp_master->sev_data) + return NULL; + + page = alloc_pages(gfp_mask, order); + if (!page) + return NULL; + + /* If SEV-SNP is initialized then add the page in RMP table. */ + sev = psp_master->sev_data; + if (!sev->snp_inited) + return page; + + paddr = __pa((unsigned long)page_address(page)); + if (snp_set_rmp_state(paddr, npages, true, locked, false)) + return NULL; + + return page; +} + +void *snp_alloc_firmware_page(gfp_t gfp_mask) +{ + struct page *page; + + page = __snp_alloc_firmware_pages(gfp_mask, 0, false); + + return page ? 
page_address(page) : NULL; +} +EXPORT_SYMBOL_GPL(snp_alloc_firmware_page); + +static void __snp_free_firmware_pages(struct page *page, int order, bool locked) +{ + unsigned long paddr, npages = 1ul << order; + + if (!page) + return; + + paddr = __pa((unsigned long)page_address(page)); + if (snp_set_rmp_state(paddr, npages, false, locked, true)) + return; + + __free_pages(page, order); +} + +void snp_free_firmware_page(void *addr) +{ + if (!addr) + return; + + __snp_free_firmware_pages(virt_to_page(addr), 0, false); +} +EXPORT_SYMBOL(snp_free_firmware_page); + static void *sev_fw_alloc(unsigned long len) { struct page *page; - page = alloc_pages(GFP_KERNEL, get_order(len)); + page = __snp_alloc_firmware_pages(GFP_KERNEL, get_order(len), false); if (!page) return NULL; @@ -393,7 +551,7 @@ static int __sev_init_locked(int *error) data.tmr_address = __pa(sev_es_tmr); data.flags |= SEV_INIT_FLAGS_SEV_ES; - data.tmr_len = SEV_ES_TMR_SIZE; + data.tmr_len = sev_es_tmr_size; } return __sev_do_cmd_locked(SEV_CMD_INIT, &data, error); @@ -421,7 +579,7 @@ static int __sev_init_ex_locked(int *error) data.tmr_address = __pa(sev_es_tmr); data.flags |= SEV_INIT_FLAGS_SEV_ES; - data.tmr_len = SEV_ES_TMR_SIZE; + data.tmr_len = sev_es_tmr_size; } return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error); @@ -818,6 +976,8 @@ static int __sev_snp_init_locked(int *error) sev->snp_inited = true; dev_dbg(sev->dev, "SEV-SNP firmware initialized\n"); + sev_es_tmr_size = SEV_SNP_ES_TMR_SIZE; + return rc; } @@ -1341,8 +1501,9 @@ static void sev_firmware_shutdown(struct sev_device *sev) /* The TMR area was encrypted, flush it from the cache */ wbinvd_on_all_cpus(); - free_pages((unsigned long)sev_es_tmr, - get_order(SEV_ES_TMR_SIZE)); + __snp_free_firmware_pages(virt_to_page(sev_es_tmr), + get_order(sev_es_tmr_size), + false); sev_es_tmr = NULL; } @@ -1430,7 +1591,7 @@ void sev_pci_init(void) } /* Obtain the TMR memory area for SEV-ES use */ - sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE); + sev_es_tmr = sev_fw_alloc(sev_es_tmr_size); if (!sev_es_tmr) dev_warn(sev->dev, "SEV: TMR allocation failed, SEV-ES support unavailable\n"); diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 9f921d221b75..a3bb792bb842 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -12,6 +12,8 @@ #ifndef __PSP_SEV_H__ #define __PSP_SEV_H__ +#include <linux/sev.h> + #include <uapi/linux/psp-sev.h> #ifdef CONFIG_X86 @@ -940,6 +942,8 @@ int snp_guest_page_reclaim(struct sev_data_snp_page_reclaim *data, int *error); int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *error); void *psp_copy_user_blob(u64 uaddr, u32 len); +void *snp_alloc_firmware_page(gfp_t mask); +void snp_free_firmware_page(void *addr); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -981,6 +985,13 @@ static inline int snp_guest_dbg_decrypt(struct sev_data_snp_dbg *data, int *erro return -ENODEV; } +static inline void *snp_alloc_firmware_page(gfp_t mask) +{ + return NULL; +} + +static inline void snp_free_firmware_page(void *addr) { } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */