diff mbox series

[v3] x86/sgx: Free backing memory after faulting the enclave page

Message ID 20220108140510.76583-1-jarkko@kernel.org (mailing list archive)
State New, archived
Headers show
Series [v3] x86/sgx: Free backing memory after faulting the enclave page | expand

Commit Message

Jarkko Sakkinen Jan. 8, 2022, 2:05 p.m. UTC
There is a limited amount of SGX memory (EPC) on each system.  When that
memory is used up, SGX has its own swapping mechanism which is similar
in concept but totally separate from the core mm/* code.  Instead of
swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
comes from a shared memory pseudo-file and can itself be swapped by the
core mm code.  There is a hierarchy like this:

	EPC <-> shmem <-> disk

After data is swapped back in from shmem to EPC, the shmem backing
storage needs to be freed.  Currently, the backing shmem is not freed.
This effectively wastes the shmem while the enclave is running.  The
memory is recovered when the enclave is destroyed and the backing
storage freed.

Sort this out by freeing memory with shmem_truncate_range(), as soon as
a page is faulted back to the EPC.  In addition, free the memory for
PCMD pages as soon as all PCMD's in a page have been marked as unused
by zeroing its contents.

Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: stable@vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
v3:
* Resend.
v2:
* Rewrite commit message as proposed by Dave.
* Truncate PCMD pages (Dave).
---
 arch/x86/kernel/cpu/sgx/encl.c | 48 +++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 4 deletions(-)

Comments

Reinette Chatre Jan. 13, 2022, 6:08 a.m. UTC | #1
Hi Jarkko,

On 1/8/2022 6:05 AM, Jarkko Sakkinen wrote:
> There is a limited amount of SGX memory (EPC) on each system.  When that
> memory is used up, SGX has its own swapping mechanism which is similar
> in concept but totally separate from the core mm/* code.  Instead of
> swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
> comes from a shared memory pseudo-file and can itself be swapped by the
> core mm code.  There is a hierarchy like this:
> 
> 	EPC <-> shmem <-> disk
> 
> After data is swapped back in from shmem to EPC, the shmem backing
> storage needs to be freed.  Currently, the backing shmem is not freed.
> This effectively wastes the shmem while the enclave is running.  The
> memory is recovered when the enclave is destroyed and the backing
> storage freed.
> 
> Sort this out by freeing memory with shmem_truncate_range(), as soon as
> a page is faulted back to the EPC.  In addition, free the memory for
> PCMD pages as soon as all PCMD's in a page have been marked as unused
> by zeroing its contents.
> 
> Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: stable@vger.kernel.org
> Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
> Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> ---
> v3:
> * Resend.
> v2:
> * Rewrite commit message as proposed by Dave.
> * Truncate PCMD pages (Dave).
> ---
>  arch/x86/kernel/cpu/sgx/encl.c | 48 +++++++++++++++++++++++++++++++---
>  1 file changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> index 001808e3901c..ea43c10e5458 100644
> --- a/arch/x86/kernel/cpu/sgx/encl.c
> +++ b/arch/x86/kernel/cpu/sgx/encl.c
> @@ -12,6 +12,27 @@
>  #include "encls.h"
>  #include "sgx.h"
>  
> +
> +/*
> + * Get the page number of the page in the backing storage, which stores the PCMD
> + * of the enclave page in the given page index.  PCMD pages are located after
> + * the backing storage for the visible enclave pages and SECS.
> + */
> +static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> +{
> +	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
> +}
> +
> +/*
> + * Free a page from the backing storage in the given page index.
> + */
> +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, pgoff_t index)
> +{
> +	struct inode *inode = file_inode(encl->backing);
> +
> +	shmem_truncate_range(inode, PFN_PHYS(index), PFN_PHYS(index) + PAGE_SIZE - 1);
> +}
> +
>  /*
>   * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
>   * Pages" in the SDM.
> @@ -24,7 +45,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  	struct sgx_encl *encl = encl_page->encl;
>  	struct sgx_pageinfo pginfo;
>  	struct sgx_backing b;
> +	bool pcmd_page_empty;
>  	pgoff_t page_index;
> +	pgoff_t pcmd_index;
> +	u8 *pcmd_page;
>  	int ret;
>  
>  	if (secs_page)
> @@ -38,8 +62,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  
>  	pginfo.addr = encl_page->desc & PAGE_MASK;
>  	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
> -	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
> -			  b.pcmd_offset;
> +	pcmd_page = kmap_atomic(b.pcmd);
> +	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
>  
>  	if (secs_page)
>  		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
> @@ -55,11 +79,27 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
>  		ret = -EFAULT;
>  	}
>  
> -	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
> +	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
> +
> +	/*
> +	 * The area for the PCMD in the page was zeroed above.  Check if the
> +	 * whole page is now empty meaning that all PCMD's have been zeroed:
> +	 */
> +	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
> +
> +	kunmap_atomic(pcmd_page);
>  	kunmap_atomic((void *)(unsigned long)pginfo.contents);
>  
>  	sgx_encl_put_backing(&b, false);
>  
> +	/* Free the backing memory. */
> +	sgx_encl_truncate_backing_page(encl, page_index);
> +
> +	if (pcmd_page_empty) {
> +		pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
> +		sgx_encl_truncate_backing_page(encl, pcmd_index);
> +	}
> +
>  	return ret;
>  }
>  
> @@ -577,7 +617,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
>  int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
>  			 struct sgx_backing *backing)
>  {
> -	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
> +	pgoff_t pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
>  	struct page *contents;
>  	struct page *pcmd;
>  

I applied this patch on top of commit 2056e2989bf4 ("x86/sgx: Fix NULL pointer
dereference on non-SGX systems") found on branch x86/sgx of the tip repo.

When I run the SGX selftests the new oversubscription test case is failing with
the error below:
./test_sgx
TAP version 13
1..6
# Starting 6 tests from 2 test cases.
#  RUN           enclave.unclobbered_vdso ...
#            OK  enclave.unclobbered_vdso
ok 1 enclave.unclobbered_vdso
#  RUN           enclave.unclobbered_vdso_oversubscribed ...
# main.c:330:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
# main.c:330:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f6000000fff
# main.c:338:unclobbered_vdso_oversubscribed:Expected get_op.value (0) == MAGIC (1234605616436508552)
# main.c:339:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
# main.c:339:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f6000000fff
# unclobbered_vdso_oversubscribed: Test failed at step #2
#          FAIL  enclave.unclobbered_vdso_oversubscribed
not ok 2 enclave.unclobbered_vdso_oversubscribed
#  RUN           enclave.clobbered_vdso ...
#            OK  enclave.clobbered_vdso
ok 3 enclave.clobbered_vdso
#  RUN           enclave.clobbered_vdso_and_user_function ...
#            OK  enclave.clobbered_vdso_and_user_function
ok 4 enclave.clobbered_vdso_and_user_function
#  RUN           enclave.tcs_entry ...
#            OK  enclave.tcs_entry
ok 5 enclave.tcs_entry
#  RUN           enclave.pte_permissions ...
#            OK  enclave.pte_permissions

The kernel logs also contain a splat that I have not encountered before:

------------[ cut here ]------------
ELDU returned 9 (0x9)
WARNING: CPU: 6 PID: 2470 at arch/x86/kernel/cpu/sgx/encl.c:77 sgx_encl_eldu+0x37c/0x3f0
Modules linked in: intel_rapl_msr intel_rapl_common i10nm_edac x86_pkg_temp_thermal ipmi_ssif coretemp kvm_intel kvm cmdlinepart intel_spi_pci intel_spi spi_nor ipmi_si mei_me ipmi_devintf input_leds irqbypass mtd mei ioatdma intel_pch_thermal wmi ipmi_msghandler acpi_power_meter iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq raid1 raid0 multipath linear ixgbe crct10dif_pclmul crc32_pclmul ghash_clmulni_intel hid_generic aesni_intel crypto_simd xfrm_algo usbhid cryptd ast dca hid mdio drm_vram_helper drm_ttm_helper
CPU: 6 PID: 2470 Comm: test_sgx Not tainted 5.16.0-rc1+ #24
Hardware name: Intel Corporation 
RIP: 0010:sgx_encl_eldu+0x37c/0x3f0
Code: 89 c2 48 c7 c6 e1 e9 3e 9b 48 c7 c7 e6 e9 3e 9b 44 89 95 54 ff ff ff 4c 89 85 58 ff ff ff c6 05 fc bd dd 01 01 e8 54 88 03 00 <0f> 0b 44 8b 95 54 ff ff ff 4c 8b 85 58 ff ff ff e9 46 fe ff ff 48
<snip>
Call Trace:
<TASK>
sgx_encl_load_page+0x82/0xc0
? sgx_encl_load_page+0x82/0xc0
sgx_vma_fault+0x40/0xe0
__do_fault+0x32/0x110
__handle_mm_fault+0xf84/0x1510
handle_mm_fault+0x13e/0x3f0
do_user_addr_fault+0x210/0x660
? rcu_read_lock_sched_held+0x4f/0x80
exc_page_fault+0x7b/0x270
? asm_exc_page_fault+0x8/0x30
asm_exc_page_fault+0x1e/0x30
RIP: 0033:0x7ffe7fdc3dba
<snip>

I ran the test on two systems and in both cases the test failed accompanied by
the kernel splat.

Reinette
Jarkko Sakkinen Jan. 14, 2022, 9:25 p.m. UTC | #2
On Wed, Jan 12, 2022 at 10:08:02PM -0800, Reinette Chatre wrote:
> Hi Jarkko,
> 
> On 1/8/2022 6:05 AM, Jarkko Sakkinen wrote:
> > There is a limited amount of SGX memory (EPC) on each system.  When that
> > memory is used up, SGX has its own swapping mechanism which is similar
> > in concept but totally separate from the core mm/* code.  Instead of
> > swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
> > comes from a shared memory pseudo-file and can itself be swapped by the
> > core mm code.  There is a hierarchy like this:
> > 
> > 	EPC <-> shmem <-> disk
> > 
> > After data is swapped back in from shmem to EPC, the shmem backing
> > storage needs to be freed.  Currently, the backing shmem is not freed.
> > This effectively wastes the shmem while the enclave is running.  The
> > memory is recovered when the enclave is destroyed and the backing
> > storage freed.
> > 
> > Sort this out by freeing memory with shmem_truncate_range(), as soon as
> > a page is faulted back to the EPC.  In addition, free the memory for
> > PCMD pages as soon as all PCMD's in a page have been marked as unused
> > by zeroing its contents.
> > 
> > Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
> > Cc: stable@vger.kernel.org
> > Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
> > Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> > ---
> > v3:
> > * Resend.
> > v2:
> > * Rewrite commit message as proposed by Dave.
> > * Truncate PCMD pages (Dave).
> > ---
> >  arch/x86/kernel/cpu/sgx/encl.c | 48 +++++++++++++++++++++++++++++++---
> >  1 file changed, 44 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> > index 001808e3901c..ea43c10e5458 100644
> > --- a/arch/x86/kernel/cpu/sgx/encl.c
> > +++ b/arch/x86/kernel/cpu/sgx/encl.c
> > @@ -12,6 +12,27 @@
> >  #include "encls.h"
> >  #include "sgx.h"
> >  
> > +
> > +/*
> > + * Get the page number of the page in the backing storage, which stores the PCMD
> > + * of the enclave page in the given page index.  PCMD pages are located after
> > + * the backing storage for the visible enclave pages and SECS.
> > + */
> > +static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> > +{
> > +	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
> > +}
> > +
> > +/*
> > + * Free a page from the backing storage in the given page index.
> > + */
> > +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, pgoff_t index)
> > +{
> > +	struct inode *inode = file_inode(encl->backing);
> > +
> > +	shmem_truncate_range(inode, PFN_PHYS(index), PFN_PHYS(index) + PAGE_SIZE - 1);
> > +}
> > +
> >  /*
> >   * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
> >   * Pages" in the SDM.
> > @@ -24,7 +45,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> >  	struct sgx_encl *encl = encl_page->encl;
> >  	struct sgx_pageinfo pginfo;
> >  	struct sgx_backing b;
> > +	bool pcmd_page_empty;
> >  	pgoff_t page_index;
> > +	pgoff_t pcmd_index;
> > +	u8 *pcmd_page;
> >  	int ret;
> >  
> >  	if (secs_page)
> > @@ -38,8 +62,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> >  
> >  	pginfo.addr = encl_page->desc & PAGE_MASK;
> >  	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
> > -	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
> > -			  b.pcmd_offset;
> > +	pcmd_page = kmap_atomic(b.pcmd);
> > +	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
> >  
> >  	if (secs_page)
> >  		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
> > @@ -55,11 +79,27 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> >  		ret = -EFAULT;
> >  	}
> >  
> > -	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
> > +	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
> > +
> > +	/*
> > +	 * The area for the PCMD in the page was zeroed above.  Check if the
> > +	 * whole page is now empty meaning that all PCMD's have been zeroed:
> > +	 */
> > +	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
> > +
> > +	kunmap_atomic(pcmd_page);
> >  	kunmap_atomic((void *)(unsigned long)pginfo.contents);
> >  
> >  	sgx_encl_put_backing(&b, false);
> >  
> > +	/* Free the backing memory. */
> > +	sgx_encl_truncate_backing_page(encl, page_index);
> > +
> > +	if (pcmd_page_empty) {
> > +		pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
> > +		sgx_encl_truncate_backing_page(encl, pcmd_index);
> > +	}
> > +
> >  	return ret;
> >  }
> >  
> > @@ -577,7 +617,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
> >  int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
> >  			 struct sgx_backing *backing)
> >  {
> > -	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
> > +	pgoff_t pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
> >  	struct page *contents;
> >  	struct page *pcmd;
> >  
> 
> I applied this patch on top of commit 2056e2989bf4 ("x86/sgx: Fix NULL pointer
> dereference on non-SGX systems") found on branch x86/sgx of the tip repo.
> 
> When I run the SGX selftests the new oversubscription test case is failing with
> the error below:
> ./test_sgx
> TAP version 13
> 1..6
> # Starting 6 tests from 2 test cases.
> #  RUN           enclave.unclobbered_vdso ...
> #            OK  enclave.unclobbered_vdso
> ok 1 enclave.unclobbered_vdso
> #  RUN           enclave.unclobbered_vdso_oversubscribed ...
> # main.c:330:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
> # main.c:330:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f6000000fff
> # main.c:338:unclobbered_vdso_oversubscribed:Expected get_op.value (0) == MAGIC (1234605616436508552)
> # main.c:339:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
> # main.c:339:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f6000000fff
> # unclobbered_vdso_oversubscribed: Test failed at step #2
> #          FAIL  enclave.unclobbered_vdso_oversubscribed
> not ok 2 enclave.unclobbered_vdso_oversubscribed
> #  RUN           enclave.clobbered_vdso ...
> #            OK  enclave.clobbered_vdso
> ok 3 enclave.clobbered_vdso
> #  RUN           enclave.clobbered_vdso_and_user_function ...
> #            OK  enclave.clobbered_vdso_and_user_function
> ok 4 enclave.clobbered_vdso_and_user_function
> #  RUN           enclave.tcs_entry ...
> #            OK  enclave.tcs_entry
> ok 5 enclave.tcs_entry
> #  RUN           enclave.pte_permissions ...
> #            OK  enclave.pte_permissions
> 
> The kernel logs also contain a splat that I have not encountered before:
> 
> ------------[ cut here ]------------
> ELDU returned 9 (0x9)
> WARNING: CPU: 6 PID: 2470 at arch/x86/kernel/cpu/sgx/encl.c:77 sgx_encl_eldu+0x37c/0x3f0
> Modules linked in: intel_rapl_msr intel_rapl_common i10nm_edac x86_pkg_temp_thermal ipmi_ssif coretemp kvm_intel kvm cmdlinepart intel_spi_pci intel_spi spi_nor ipmi_si mei_me ipmi_devintf input_leds irqbypass mtd mei ioatdma intel_pch_thermal wmi ipmi_msghandler acpi_power_meter iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq raid1 raid0 multipath linear ixgbe crct10dif_pclmul crc32_pclmul ghash_clmulni_intel hid_generic aesni_intel crypto_simd xfrm_algo usbhid cryptd ast dca hid mdio drm_vram_helper drm_ttm_helper
> CPU: 6 PID: 2470 Comm: test_sgx Not tainted 5.16.0-rc1+ #24
> Hardware name: Intel Corporation 
> RIP: 0010:sgx_encl_eldu+0x37c/0x3f0
> Code: 89 c2 48 c7 c6 e1 e9 3e 9b 48 c7 c7 e6 e9 3e 9b 44 89 95 54 ff ff ff 4c 89 85 58 ff ff ff c6 05 fc bd dd 01 01 e8 54 88 03 00 <0f> 0b 44 8b 95 54 ff ff ff 4c 8b 85 58 ff ff ff e9 46 fe ff ff 48
> <snip>
> Call Trace:
> <TASK>
> sgx_encl_load_page+0x82/0xc0
> ? sgx_encl_load_page+0x82/0xc0
> sgx_vma_fault+0x40/0xe0
> __do_fault+0x32/0x110
> __handle_mm_fault+0xf84/0x1510
> handle_mm_fault+0x13e/0x3f0
> do_user_addr_fault+0x210/0x660
> ? rcu_read_lock_sched_held+0x4f/0x80
> exc_page_fault+0x7b/0x270
> ? asm_exc_page_fault+0x8/0x30
> asm_exc_page_fault+0x1e/0x30
> RIP: 0033:0x7ffe7fdc3dba
> <snip>
> 
> I ran the test on two systems and in both cases the test failed accompanied by
> the kernel splat.
> 
> Reinette

Thank you for testing this.

I did not get any errors when I run kselftest at the time *but* it was
exactly two months ago (2021-11-11). I cannot recall whether this test
was already in at the time, or did I run the overcommit test out-of-tree,
or if some confliciting non-kselftest changes have been applied.

I'll do the backtracking when I have the time by doing git bisect between
2021-11-11 x86/sgx and the current one.

/Jarkko
Jarkko Sakkinen Jan. 17, 2022, 8:30 a.m. UTC | #3
On Fri, Jan 14, 2022 at 11:25:47PM +0200, Jarkko Sakkinen wrote:
> On Wed, Jan 12, 2022 at 10:08:02PM -0800, Reinette Chatre wrote:
> > Hi Jarkko,
> > 
> > On 1/8/2022 6:05 AM, Jarkko Sakkinen wrote:
> > > There is a limited amount of SGX memory (EPC) on each system.  When that
> > > memory is used up, SGX has its own swapping mechanism which is similar
> > > in concept but totally separate from the core mm/* code.  Instead of
> > > swapping to disk, SGX swaps from EPC to normal RAM.  That normal RAM
> > > comes from a shared memory pseudo-file and can itself be swapped by the
> > > core mm code.  There is a hierarchy like this:
> > > 
> > > 	EPC <-> shmem <-> disk
> > > 
> > > After data is swapped back in from shmem to EPC, the shmem backing
> > > storage needs to be freed.  Currently, the backing shmem is not freed.
> > > This effectively wastes the shmem while the enclave is running.  The
> > > memory is recovered when the enclave is destroyed and the backing
> > > storage freed.
> > > 
> > > Sort this out by freeing memory with shmem_truncate_range(), as soon as
> > > a page is faulted back to the EPC.  In addition, free the memory for
> > > PCMD pages as soon as all PCMD's in a page have been marked as unused
> > > by zeroing its contents.
> > > 
> > > Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
> > > Cc: stable@vger.kernel.org
> > > Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
> > > Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
> > > ---
> > > v3:
> > > * Resend.
> > > v2:
> > > * Rewrite commit message as proposed by Dave.
> > > * Truncate PCMD pages (Dave).
> > > ---
> > >  arch/x86/kernel/cpu/sgx/encl.c | 48 +++++++++++++++++++++++++++++++---
> > >  1 file changed, 44 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
> > > index 001808e3901c..ea43c10e5458 100644
> > > --- a/arch/x86/kernel/cpu/sgx/encl.c
> > > +++ b/arch/x86/kernel/cpu/sgx/encl.c
> > > @@ -12,6 +12,27 @@
> > >  #include "encls.h"
> > >  #include "sgx.h"
> > >  
> > > +
> > > +/*
> > > + * Get the page number of the page in the backing storage, which stores the PCMD
> > > + * of the enclave page in the given page index.  PCMD pages are located after
> > > + * the backing storage for the visible enclave pages and SECS.
> > > + */
> > > +static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> > > +{
> > > +	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
> > > +}
> > > +
> > > +/*
> > > + * Free a page from the backing storage in the given page index.
> > > + */
> > > +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, pgoff_t index)
> > > +{
> > > +	struct inode *inode = file_inode(encl->backing);
> > > +
> > > +	shmem_truncate_range(inode, PFN_PHYS(index), PFN_PHYS(index) + PAGE_SIZE - 1);
> > > +}
> > > +
> > >  /*
> > >   * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
> > >   * Pages" in the SDM.
> > > @@ -24,7 +45,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> > >  	struct sgx_encl *encl = encl_page->encl;
> > >  	struct sgx_pageinfo pginfo;
> > >  	struct sgx_backing b;
> > > +	bool pcmd_page_empty;
> > >  	pgoff_t page_index;
> > > +	pgoff_t pcmd_index;
> > > +	u8 *pcmd_page;
> > >  	int ret;
> > >  
> > >  	if (secs_page)
> > > @@ -38,8 +62,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> > >  
> > >  	pginfo.addr = encl_page->desc & PAGE_MASK;
> > >  	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
> > > -	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
> > > -			  b.pcmd_offset;
> > > +	pcmd_page = kmap_atomic(b.pcmd);
> > > +	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
> > >  
> > >  	if (secs_page)
> > >  		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
> > > @@ -55,11 +79,27 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> > >  		ret = -EFAULT;
> > >  	}
> > >  
> > > -	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
> > > +	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
> > > +
> > > +	/*
> > > +	 * The area for the PCMD in the page was zeroed above.  Check if the
> > > +	 * whole page is now empty meaning that all PCMD's have been zeroed:
> > > +	 */
> > > +	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
> > > +
> > > +	kunmap_atomic(pcmd_page);
> > >  	kunmap_atomic((void *)(unsigned long)pginfo.contents);
> > >  
> > >  	sgx_encl_put_backing(&b, false);
> > >  
> > > +	/* Free the backing memory. */
> > > +	sgx_encl_truncate_backing_page(encl, page_index);
> > > +
> > > +	if (pcmd_page_empty) {
> > > +		pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
> > > +		sgx_encl_truncate_backing_page(encl, pcmd_index);
> > > +	}
> > > +
> > >  	return ret;
> > >  }
> > >  
> > > @@ -577,7 +617,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
> > >  int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
> > >  			 struct sgx_backing *backing)
> > >  {
> > > -	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
> > > +	pgoff_t pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
> > >  	struct page *contents;
> > >  	struct page *pcmd;
> > >  
> > 
> > I applied this patch on top of commit 2056e2989bf4 ("x86/sgx: Fix NULL pointer
> > dereference on non-SGX systems") found on branch x86/sgx of the tip repo.
> > 
> > When I run the SGX selftests the new oversubscription test case is failing with
> > the error below:
> > ./test_sgx
> > TAP version 13
> > 1..6
> > # Starting 6 tests from 2 test cases.
> > #  RUN           enclave.unclobbered_vdso ...
> > #            OK  enclave.unclobbered_vdso
> > ok 1 enclave.unclobbered_vdso
> > #  RUN           enclave.unclobbered_vdso_oversubscribed ...
> > # main.c:330:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
> > # main.c:330:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f6000000fff
> > # main.c:338:unclobbered_vdso_oversubscribed:Expected get_op.value (0) == MAGIC (1234605616436508552)
> > # main.c:339:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
> > # main.c:339:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f6000000fff
> > # unclobbered_vdso_oversubscribed: Test failed at step #2
> > #          FAIL  enclave.unclobbered_vdso_oversubscribed
> > not ok 2 enclave.unclobbered_vdso_oversubscribed
> > #  RUN           enclave.clobbered_vdso ...
> > #            OK  enclave.clobbered_vdso
> > ok 3 enclave.clobbered_vdso
> > #  RUN           enclave.clobbered_vdso_and_user_function ...
> > #            OK  enclave.clobbered_vdso_and_user_function
> > ok 4 enclave.clobbered_vdso_and_user_function
> > #  RUN           enclave.tcs_entry ...
> > #            OK  enclave.tcs_entry
> > ok 5 enclave.tcs_entry
> > #  RUN           enclave.pte_permissions ...
> > #            OK  enclave.pte_permissions
> > 
> > The kernel logs also contain a splat that I have not encountered before:
> > 
> > ------------[ cut here ]------------
> > ELDU returned 9 (0x9)
> > WARNING: CPU: 6 PID: 2470 at arch/x86/kernel/cpu/sgx/encl.c:77 sgx_encl_eldu+0x37c/0x3f0
> > Modules linked in: intel_rapl_msr intel_rapl_common i10nm_edac x86_pkg_temp_thermal ipmi_ssif coretemp kvm_intel kvm cmdlinepart intel_spi_pci intel_spi spi_nor ipmi_si mei_me ipmi_devintf input_leds irqbypass mtd mei ioatdma intel_pch_thermal wmi ipmi_msghandler acpi_power_meter iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq raid1 raid0 multipath linear ixgbe crct10dif_pclmul crc32_pclmul ghash_clmulni_intel hid_generic aesni_intel crypto_simd xfrm_algo usbhid cryptd ast dca hid mdio drm_vram_helper drm_ttm_helper
> > CPU: 6 PID: 2470 Comm: test_sgx Not tainted 5.16.0-rc1+ #24
> > Hardware name: Intel Corporation 
> > RIP: 0010:sgx_encl_eldu+0x37c/0x3f0
> > Code: 89 c2 48 c7 c6 e1 e9 3e 9b 48 c7 c7 e6 e9 3e 9b 44 89 95 54 ff ff ff 4c 89 85 58 ff ff ff c6 05 fc bd dd 01 01 e8 54 88 03 00 <0f> 0b 44 8b 95 54 ff ff ff 4c 8b 85 58 ff ff ff e9 46 fe ff ff 48
> > <snip>
> > Call Trace:
> > <TASK>
> > sgx_encl_load_page+0x82/0xc0
> > ? sgx_encl_load_page+0x82/0xc0
> > sgx_vma_fault+0x40/0xe0
> > __do_fault+0x32/0x110
> > __handle_mm_fault+0xf84/0x1510
> > handle_mm_fault+0x13e/0x3f0
> > do_user_addr_fault+0x210/0x660
> > ? rcu_read_lock_sched_held+0x4f/0x80
> > exc_page_fault+0x7b/0x270
> > ? asm_exc_page_fault+0x8/0x30
> > asm_exc_page_fault+0x1e/0x30
> > RIP: 0033:0x7ffe7fdc3dba
> > <snip>
> > 
> > I ran the test on two systems and in both cases the test failed accompanied by
> > the kernel splat.
> > 
> > Reinette
> 
> Thank you for testing this.
> 
> I did not get any errors when I run kselftest at the time *but* it was
> exactly two months ago (2021-11-11). I cannot recall whether this test
> was already in at the time, or did I run the overcommit test out-of-tree,
> or if some confliciting non-kselftest changes have been applied.
> 
> I'll do the backtracking when I have the time by doing git bisect between
> 2021-11-11 x86/sgx and the current one.

Yep, I think I get the exact same result with tip/x86/sgx.

$ ./test_sgx 
TAP version 13
1..6
# Starting 6 tests from 2 test cases.
#  RUN           enclave.unclobbered_vdso ...
#            OK  enclave.unclobbered_vdso
ok 1 enclave.unclobbered_vdso
#  RUN           enclave.unclobbered_vdso_oversubscribed ...
# main.c:330:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
# main.c:330:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f3160000fff
# main.c:338:unclobbered_vdso_oversubscribed:Expected get_op.value (0) == MAGIC (1234605616436508552)
# main.c:339:unclobbered_vdso_oversubscribed:Expected (&self->run)->function (2) == EEXIT (4)
# main.c:339:unclobbered_vdso_oversubscribed:0x0e 0x06 0x00007f3160000fff
# unclobbered_vdso_oversubscribed: Test failed at step #2
#          FAIL  enclave.unclobbered_vdso_oversubscribed
not ok 2 enclave.unclobbered_vdso_oversubscribed
#  RUN           enclave.clobbered_vdso ...
#            OK  enclave.clobbered_vdso
ok 3 enclave.clobbered_vdso
#  RUN           enclave.clobbered_vdso_and_user_function ...
#            OK  enclave.clobbered_vdso_and_user_function
ok 4 enclave.clobbered_vdso_and_user_function
#  RUN           enclave.tcs_entry ...
#            OK  enclave.tcs_entry
ok 5 enclave.tcs_entry
#  RUN           enclave.pte_permissions ...
#            OK  enclave.pte_permissions
ok 6 enclave.pte_permissions
# FAILED: 5 / 6 tests passed.
# Totals: pass:5 fail:1 xfail:0 xpass:0 skip:0 error:0

And dmesg output:

[ 4267.158920] ------------[ cut here ]------------
[ 4267.158923] ELDU returned 9 (0x9)
[ 4267.158936] WARNING: CPU: 1 PID: 1343 at arch/x86/kernel/cpu/sgx/encl.c:77 sgx_encl_eldu+0x3f7/0x420
[ 4267.158945] Modules linked in: cfg80211 rfkill ccm algif_aead des_generic libdes ecb algif_skcipher cmac md4 algif_hash af_alg intel_rapl_msr intel_rapl_common kvm_intel kvm snd_hda_codec_generic ledtrig_audio irqbypass snd_hda_intel rapl snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec iTCO_wdt intel_pmc_bxt iTCO_vendor_support snd_hda_core psmouse vfat snd_hwdep fat snd_pcm intel_agp i2c_i801 intel_gtt pcspkr joydev i2c_smbus snd_timer agpgart mousedev ext4 snd lpc_ich soundcore mac_hid qemu_fw_cfg crc16 mbcache jbd2 pkcs8_key_parser fuse ip_tables x_tables btrfs blake2b_generic libcrc32c crc32c_generic xor raid6_pq dm_crypt cbc encrypted_keys trusted asn1_encoder tee usbhid dm_mod virtio_gpu virtio_dma_buf drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops virtio_rng cec virtio_balloon virtio_blk virtio_console drm virtio_net net_failover failover crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel crypto_simd cryptd tpm_crb tpm_tis serio_raw
[ 4267.159078]  tpm_tis_core xhci_pci xhci_pci_renesas tpm virtio_pci virtio_pci_legacy_dev virtio_pci_modern_dev rng_core
[ 4267.159089] CPU: 1 PID: 1343 Comm: test_sgx Not tainted 5.16.0-rc1-1-sgx-g142746670045 #1
[ 4267.159092] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
[ 4267.159095] RIP: 0010:sgx_encl_eldu+0x3f7/0x420
[ 4267.159098] Code: ff ff ff 89 c1 89 c2 48 c7 c6 03 06 d6 9c 4c 89 0c 24 48 c7 c7 08 06 d6 9c 44 89 54 24 08 c6 05 5f 45 b6 01 01 e8 d2 78 a6 00 <0f> 0b 4c 8b 0c 24 44 8b 54 24 08 e9 36 fe ff ff e8 5b 8b fa ff e9
[ 4267.159099] RSP: 0000:ffffb5314074bcc0 EFLAGS: 00010282
[ 4267.159100] RAX: 0000000000000000 RBX: ffffb5314074bce0 RCX: 0000000000000027
[ 4267.159101] RDX: ffff8bd9f7d20728 RSI: 0000000000000001 RDI: ffff8bd9f7d20720
[ 4267.159102] RBP: ffffb5314074bd70 R08: 0000000000000000 R09: ffffb5314074baf0
[ 4267.159103] R10: ffffb5314074bae8 R11: ffffffff9d4cbd28 R12: ffff8bd9a3ca18c0
[ 4267.159104] R13: 0000000000000000 R14: ffff8bd882139000 R15: ffffb531403a9420
[ 4267.159105] FS:  00007f3163e48c00(0000) GS:ffff8bd9f7d00000(0000) knlGS:0000000000000000
[ 4267.159106] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4267.159107] CR2: 00007f3160000fff CR3: 00000001a2046001 CR4: 0000000000370ee0
[ 4267.159111] Call Trace:
[ 4267.159116]  <TASK>
[ 4267.159124]  sgx_encl_load_page+0x73/0xb0
[ 4267.159126]  sgx_vma_fault+0x3a/0xd0
[ 4267.159127]  __do_fault+0x36/0xd0
[ 4267.159132]  __handle_mm_fault+0xd4e/0x1540
[ 4267.159135]  handle_mm_fault+0xb2/0x280
[ 4267.159137]  do_user_addr_fault+0x1ba/0x690
[ 4267.159140]  exc_page_fault+0x72/0x170
[ 4267.159144]  ? asm_exc_page_fault+0x8/0x30
[ 4267.159147]  asm_exc_page_fault+0x1e/0x30
[ 4267.159150] RIP: 0033:0x7ffe771c4c8a
[ 4267.159153] Code: 43 48 8b 4d 10 48 c7 c3 28 00 00 00 48 83 3c 19 00 75 31 48 83 c3 08 48 81 fb 00 01 00 00 75 ec 48 8b 19 48 8d 0d 00 00 00 00 <0f> 01 d7 48 8b 5d 10 c7 43 08 04 00 00 00 48 83 7b 18 00 75 21 31
[ 4267.159154] RSP: 002b:00007ffe7719ce88 EFLAGS: 00010246
[ 4267.159155] RAX: 0000000000000002 RBX: 00007f3160000000 RCX: 00007ffe771c4c8a
[ 4267.159156] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007ffe7719d020
[ 4267.159157] RBP: 00007ffe7719ce90 R08: 0000000000000000 R09: 0000000000000000
[ 4267.159158] R10: 00007f3163e86bb0 R11: 00007f3163fcd9f0 R12: 000055c4ac58a410
[ 4267.159158] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[ 4267.159162]  </TASK>
[ 4267.159163] ---[ end trace 1b44544248db3939 ]---
[ 5395.692320] audit: type=1100 audit(1642407877.716:143): pid=1397 uid=1000 auid=1000 ses=3 msg='op=PAM:authentication grantors=? acct="jarkko" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/0 res=failed'
[ 5400.216126] audit: type=1100 audit(1642407882.243:144): pid=1397 uid=1000 auid=1000 ses=3 msg='op=PAM:authentication grantors=pam_faillock,pam_permit,pam_faillock acct="jarkko" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/0 res=success'
[ 5400.216289] audit: type=1101 audit(1642407882.243:145): pid=1397 uid=1000 auid=1000 ses=3 msg='op=PAM:accounting grantors=pam_unix,pam_permit,pam_time acct="jarkko" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/0 res=success'
[ 5400.217010] audit: type=1110 audit(1642407882.243:146): pid=1397 uid=1000 auid=1000 ses=3 msg='op=PAM:setcred grantors=pam_faillock,pam_permit,pam_faillock acct="root" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/0 res=success'
[ 5400.219633] audit: type=1105 audit(1642407882.246:147): pid=1397 uid=1000 auid=1000 ses=3 msg='op=PAM:session_open grantors=pam_systemd_home,pam_limits,pam_unix,pam_permit acct="root" exe="/usr/bin/sudo" hostname=? addr=? terminal=/dev/pts/0 res=success'

This was run in QEMU.

/Jarkko
Dave Hansen Jan. 24, 2022, 5:17 p.m. UTC | #4
On 1/12/22 22:08, Reinette Chatre wrote:
> ------------[ cut here ]------------
> ELDU returned 9 (0x9)
> WARNING: CPU: 6 PID: 2470 at arch/x86/kernel/cpu/sgx/encl.c:77 sgx_encl_eldu+0x37c/0x3f0

Could we flesh out that error message, please?

That "return" is a code from "Information and Error Codes" in the SGX 
SDM chapter.  It's also spelled out in 'enum sgx_return_code'.  So, at 
least, this should probably say:

	ELDU sgx_return_code: 9 (0x9)

*That* is at least greppable.
Jarkko Sakkinen Jan. 26, 2022, 1:31 p.m. UTC | #5
On Mon, Jan 24, 2022 at 09:17:32AM -0800, Dave Hansen wrote:
> On 1/12/22 22:08, Reinette Chatre wrote:
> > ------------[ cut here ]------------
> > ELDU returned 9 (0x9)
> > WARNING: CPU: 6 PID: 2470 at arch/x86/kernel/cpu/sgx/encl.c:77 sgx_encl_eldu+0x37c/0x3f0
> 
> Could we flesh out that error message, please?
> 
> That "return" is a code from "Information and Error Codes" in the SGX SDM
> chapter.  It's also spelled out in 'enum sgx_return_code'.  So, at least,
> this should probably say:
> 
> 	ELDU sgx_return_code: 9 (0x9)
> 
> *That* is at least greppable.

Yes, I'm all for adding sgx_ prefix to everything but should it rather
be "sgx: ELDU ..." like most of everything else?

/Jarkko
Jarkko Sakkinen Jan. 27, 2022, 7:01 a.m. UTC | #6
On Sat, Jan 08, 2022 at 04:05:10PM +0200, Jarkko Sakkinen wrote:
> +static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> +{
> +	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
> +}

Found it.

This should be 

static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
{
	return PFN_DOWN(encl->size) + 1 + (index * PAGE_SIZE) / sizeof(struct sgx_pcmd);
}

/Jarkko
Dave Hansen Jan. 31, 2022, 11:30 p.m. UTC | #7
On 1/26/22 23:01, Jarkko Sakkinen wrote:
> On Sat, Jan 08, 2022 at 04:05:10PM +0200, Jarkko Sakkinen wrote:
>> +static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
>> +{
>> +	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
>> +}
> Found it.
> 
> This should be 
> 
> static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> {
> 	return PFN_DOWN(encl->size) + 1 + (index * PAGE_SIZE) / sizeof(struct sgx_pcmd);
> }

I've looked at this for about 10 minutes and been simultaneously
confused as to whether it is right or wrong.  That makes it
automatically wrong. :)

First, this isn't calculating a "PCMD number".  It's calculating backing
offset.  The "PCMD number" calculation is only a part of it.  I think
that makes the naming rather sloppy.

Second, I think the typing is sloppy.  page_index for example:

> static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
>                            struct sgx_epc_page *epc_page,
>                            struct sgx_epc_page *secs_page)
> {
...
>         pgoff_t page_index;

It's storing a page number:

                page_index = PFN_DOWN(encl->size);

not a real offset-into-a-file.  That makes it even more confusing when
'page_index' crosses a function boundary, gets renamed to 'index' and
then its units get confused.

/*
 * Given a page number within an enclave (@epc_page_nr), calculate the
 * offset in bytes into the backing file where that page's PCMD is
 * located.
 */
-static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl
*encl, pgoff_t index)
+static inline pgoff_t sgx_page_nr_to_pcmd_nr(struct sgx_encl *encl,
unsigned long epc_page_nr)
{
	pgoff_t last_epc_offset = PFN_DOWN(encl->size);
	pgoff_t pcmd_offset;

	// The SECS page is stashed in a slot after the
	// last normal EPC page.  Leave space for it:
	last_epc_offset++;

	pcmd_offset = epc_page_nr / sizeof(struct sgx_pcmd);

	return last_epc_offset + pcmd_offset;
}

Looking at that, I still think your *original* version is correct.

Am I just all twisted around from looking at this code too much?  Could
you please take another look and send a new version of the patch?
Jarkko Sakkinen Feb. 20, 2022, 6:52 p.m. UTC | #8
On Mon, Jan 31, 2022 at 03:30:52PM -0800, Dave Hansen wrote:
> On 1/26/22 23:01, Jarkko Sakkinen wrote:
> > On Sat, Jan 08, 2022 at 04:05:10PM +0200, Jarkko Sakkinen wrote:
> >> +static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> >> +{
> >> +	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
> >> +}
> > Found it.
> > 
> > This should be 
> > 
> > static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> > {
> > 	return PFN_DOWN(encl->size) + 1 + (index * PAGE_SIZE) / sizeof(struct sgx_pcmd);
> > }
> 
> I've looked at this for about 10 minutes and been simultaneously
> confused as to whether it is right or wrong.  That makes it
> automatically wrong. :)
> 
> First, this isn't calculating a "PCMD number".  It's calculating backing
> offset.  The "PCMD number" calculation is only a part of it.  I think
> that makes the naming rather sloppy.
> 
> Second, I think the typing is sloppy.  page_index for example:
> 
> > static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> >                            struct sgx_epc_page *epc_page,
> >                            struct sgx_epc_page *secs_page)
> > {
> ...
> >         pgoff_t page_index;
> 
> It's storing a page number:
> 
>                 page_index = PFN_DOWN(encl->size);
> 
> not a real offset-into-a-file.  That makes it even more confusing when
> 'page_index' crosses a function boundary, gets renamed to 'index' and
> then its units get confused.
> 
> /*
>  * Given a page number within an enclave (@epc_page_nr), calculate the
>  * offset in bytes into the backing file where that page's PCMD is
>  * located.
>  */
> -static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl
> *encl, pgoff_t index)
> +static inline pgoff_t sgx_page_nr_to_pcmd_nr(struct sgx_encl *encl,
> unsigned long epc_page_nr)
> {
> 	pgoff_t last_epc_offset = PFN_DOWN(encl->size);
> 	pgoff_t pcmd_offset;
> 
> 	// The SECS page is stashed in a slot after the
> 	// last normal EPC page.  Leave space for it:
> 	last_epc_offset++;
> 
> 	pcmd_offset = epc_page_nr / sizeof(struct sgx_pcmd);
> 
> 	return last_epc_offset + pcmd_offset;
> }
> 
> Looking at that, I still think your *original* version is correct.
> 
> Am I just all twisted around from looking at this code too much?  Could
> you please take another look and send a new version of the patch?

Yeah, I will. It took me two weeks to make full remote attestation
implementation for SGX in Rust but at least I can confirm that all
the kernel bits work on that (and this will also help me to finish
the man page because it is pretty nasty to document it if you haven't
gone to the rabbit hole yourself).

I'll prioritize now to get SGX2 patches tested with Enarx implementation
but this 2nd thing in my kernel queue.

/Jarkko
Dave Hansen Feb. 20, 2022, 6:58 p.m. UTC | #9
On 2/20/22 10:52, Jarkko Sakkinen wrote:
>> Am I just all twisted around from looking at this code too much?  Could
>> you please take another look and send a new version of the patch?
> Yeah, I will. It took me two weeks to make full remote attestation
> implementation for SGX in Rust but at least I can confirm that all
> the kernel bits work on that (and this will also help me to finish
> the man page because it is pretty nasty to document it if you haven't
> gone to the rabbit hole yourself).
> 
> I'll prioritize now to get SGX2 patches tested with Enarx implementation
> but this 2nd thing in my kernel queue.

I really prioritize bug fixes over new features.  New features can's
really get merged until this gets fixed up.
Jarkko Sakkinen Feb. 20, 2022, 8:09 p.m. UTC | #10
On Sun, Feb 20, 2022 at 10:58:00AM -0800, Dave Hansen wrote:
> On 2/20/22 10:52, Jarkko Sakkinen wrote:
> >> Am I just all twisted around from looking at this code too much?  Could
> >> you please take another look and send a new version of the patch?
> > Yeah, I will. It took me two weeks to make full remote attestation
> > implementation for SGX in Rust but at least I can confirm that all
> > the kernel bits work on that (and this will also help me to finish
> > the man page because it is pretty nasty to document it if you haven't
> > gone to the rabbit hole yourself).
> > 
> > I'll prioritize now to get SGX2 patches tested with Enarx implementation
> > but this 2nd thing in my kernel queue.
> 
> I really prioritize bug fixes over new features.  New features can's
> really get merged until this gets fixed up.

Fair enough. In this case I was not exactly sure whether it is categorized
as bug or enhancement. Should I also put a fixes tag to this to the
original reclaimer code (since it has been like this since the epoch)?

BR, Jarkko
Jarkko Sakkinen Feb. 20, 2022, 8:42 p.m. UTC | #11
On Sun, Feb 20, 2022 at 09:09:27PM +0100, Jarkko Sakkinen wrote:
> On Sun, Feb 20, 2022 at 10:58:00AM -0800, Dave Hansen wrote:
> > On 2/20/22 10:52, Jarkko Sakkinen wrote:
> > >> Am I just all twisted around from looking at this code too much?  Could
> > >> you please take another look and send a new version of the patch?
> > > Yeah, I will. It took me two weeks to make full remote attestation
> > > implementation for SGX in Rust but at least I can confirm that all
> > > the kernel bits work on that (and this will also help me to finish
> > > the man page because it is pretty nasty to document it if you haven't
> > > gone to the rabbit hole yourself).
> > > 
> > > I'll prioritize now to get SGX2 patches tested with Enarx implementation
> > > but this 2nd thing in my kernel queue.
> > 
> > I really prioritize bug fixes over new features.  New features can's
> > really get merged until this gets fixed up.
> 
> Fair enough. In this case I was not exactly sure whether it is categorized
> as bug or enhancement. Should I also put a fixes tag to this to the
> original reclaimer code (since it has been like this since the epoch)?

I'll keep the fixes tag, and send a new version of the patch, as soon
as I have tested it (working on it right now).

BR, Jarkko
Jarkko Sakkinen Feb. 22, 2022, 12:15 a.m. UTC | #12
On Mon, Jan 31, 2022 at 03:30:52PM -0800, Dave Hansen wrote:
> On 1/26/22 23:01, Jarkko Sakkinen wrote:
> > On Sat, Jan 08, 2022 at 04:05:10PM +0200, Jarkko Sakkinen wrote:
> >> +static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> >> +{
> >> +	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
> >> +}
> > Found it.
> > 
> > This should be 
> > 
> > static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
> > {
> > 	return PFN_DOWN(encl->size) + 1 + (index * PAGE_SIZE) / sizeof(struct sgx_pcmd);
> > }
> 
> I've looked at this for about 10 minutes and been simultaneously
> confused as to whether it is right or wrong.  That makes it
> automatically wrong. :)
> 
> First, this isn't calculating a "PCMD number".  It's calculating backing
> offset.  The "PCMD number" calculation is only a part of it.  I think
> that makes the naming rather sloppy.
> 
> Second, I think the typing is sloppy.  page_index for example:
> 
> > static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
> >                            struct sgx_epc_page *epc_page,
> >                            struct sgx_epc_page *secs_page)
> > {
> ...
> >         pgoff_t page_index;
> 
> It's storing a page number:
> 
>                 page_index = PFN_DOWN(encl->size);
> 
> not a real offset-into-a-file.  That makes it even more confusing when
> 'page_index' crosses a function boundary, gets renamed to 'index' and
> then its units get confused.
> 
> /*
>  * Given a page number within an enclave (@epc_page_nr), calculate the
>  * offset in bytes into the backing file where that page's PCMD is
>  * located.
>  */
> -static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl
> *encl, pgoff_t index)
> +static inline pgoff_t sgx_page_nr_to_pcmd_nr(struct sgx_encl *encl,
> unsigned long epc_page_nr)
> {
> 	pgoff_t last_epc_offset = PFN_DOWN(encl->size);
> 	pgoff_t pcmd_offset;
> 
> 	// The SECS page is stashed in a slot after the
> 	// last normal EPC page.  Leave space for it:
> 	last_epc_offset++;
> 
> 	pcmd_offset = epc_page_nr / sizeof(struct sgx_pcmd);
> 
> 	return last_epc_offset + pcmd_offset;
> }
> 
> Looking at that, I still think your *original* version is correct.
> 
> Am I just all twisted around from looking at this code too much?  Could
> you please take another look and send a new version of the patch?

I gave some though and this is what I came up with.

So what do you think about this:

	pgoff_t pcmd_off = encl->size + PAGE_SIZE + page_idex * sizeof(struct sgx_pcmd); 
	
This makes the calculations quite simple:

	backing->pcmd = pcmd;
	backing->pcmd_offset = pcmd_off & PAGE_SIZE;

and

	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(pcmd_off));
	
Then pgoff_t is actually also used for what it is meant to be.

/Jarkko
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 001808e3901c..ea43c10e5458 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,27 @@ 
 #include "encls.h"
 #include "sgx.h"
 
+
+/*
+ * Get the page number of the page in the backing storage, which stores the PCMD
+ * of the enclave page in the given page index.  PCMD pages are located after
+ * the backing storage for the visible enclave pages and SECS.
+ */
+static inline pgoff_t sgx_encl_get_backing_pcmd_nr(struct sgx_encl *encl, pgoff_t index)
+{
+	return PFN_DOWN(encl->size) + 1 + (index / sizeof(struct sgx_pcmd));
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, pgoff_t index)
+{
+	struct inode *inode = file_inode(encl->backing);
+
+	shmem_truncate_range(inode, PFN_PHYS(index), PFN_PHYS(index) + PAGE_SIZE - 1);
+}
+
 /*
  * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
  * Pages" in the SDM.
@@ -24,7 +45,10 @@  static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	struct sgx_encl *encl = encl_page->encl;
 	struct sgx_pageinfo pginfo;
 	struct sgx_backing b;
+	bool pcmd_page_empty;
 	pgoff_t page_index;
+	pgoff_t pcmd_index;
+	u8 *pcmd_page;
 	int ret;
 
 	if (secs_page)
@@ -38,8 +62,8 @@  static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 
 	pginfo.addr = encl_page->desc & PAGE_MASK;
 	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
-	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
-			  b.pcmd_offset;
+	pcmd_page = kmap_atomic(b.pcmd);
+	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
 
 	if (secs_page)
 		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
@@ -55,11 +79,27 @@  static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 		ret = -EFAULT;
 	}
 
-	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+
+	/*
+	 * The area for the PCMD in the page was zeroed above.  Check if the
+	 * whole page is now empty meaning that all PCMD's have been zeroed:
+	 */
+	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+	kunmap_atomic(pcmd_page);
 	kunmap_atomic((void *)(unsigned long)pginfo.contents);
 
 	sgx_encl_put_backing(&b, false);
 
+	/* Free the backing memory. */
+	sgx_encl_truncate_backing_page(encl, page_index);
+
+	if (pcmd_page_empty) {
+		pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
+		sgx_encl_truncate_backing_page(encl, pcmd_index);
+	}
+
 	return ret;
 }
 
@@ -577,7 +617,7 @@  static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
 int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 			 struct sgx_backing *backing)
 {
-	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+	pgoff_t pcmd_index = sgx_encl_get_backing_pcmd_nr(encl, page_index);
 	struct page *contents;
 	struct page *pcmd;