diff mbox series

[2/2] x86: sgx_vepc: implement SGX_IOC_VEPC_REMOVE_ALL ioctl

Message ID 20210920125401.2389105-3-pbonzini@redhat.com (mailing list archive)
State New, archived
Headers show
Series x86: sgx_vepc: implement ioctl to EREMOVE all pages | expand

Commit Message

Paolo Bonzini Sept. 20, 2021, 12:54 p.m. UTC
For bare-metal SGX on real hardware, the hardware provides guarantees
SGX state at reboot.  For instance, all pages start out uninitialized.
The vepc driver provides a similar guarantee today for freshly-opened
vepc instances, but guests such as Windows expect all pages to be in
uninitialized state on startup, including after every guest reboot.

Some userspace implementations of virtual SGX would rather avoid having
to close and reopen the /dev/sgx_vepc file descriptor and re-mmap the
virtual EPC.  For example, they could be sandboxing themselves before the
guest starts in order to mitigate exploits from untrusted guests,
forbidding further calls to open().

Therefore, add a ioctl that does this with EREMOVE.  Userspace can
invoke the ioctl to bring its vEPC pages back to uninitialized state.
There is a possibility that some pages fail to be removed if they are
SECS pages, so the ioctl returns the number of EREMOVE failures.  The
correct usage is documented in sgx.rst.

Tested-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/x86/sgx.rst       | 14 +++++++++++++
 arch/x86/include/uapi/asm/sgx.h |  2 ++
 arch/x86/kernel/cpu/sgx/virt.c  | 36 +++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

Comments

Huang, Kai Sept. 20, 2021, 10:17 p.m. UTC | #1
On Mon, 20 Sep 2021 08:54:01 -0400 Paolo Bonzini wrote:
> For bare-metal SGX on real hardware, the hardware provides guarantees
> SGX state at reboot.  For instance, all pages start out uninitialized.
> The vepc driver provides a similar guarantee today for freshly-opened
> vepc instances, but guests such as Windows expect all pages to be in
> uninitialized state on startup, including after every guest reboot.
> 
> Some userspace implementations of virtual SGX would rather avoid having
> to close and reopen the /dev/sgx_vepc file descriptor and re-mmap the
> virtual EPC.  For example, they could be sandboxing themselves before the
> guest starts in order to mitigate exploits from untrusted guests,
> forbidding further calls to open().
> 
> Therefore, add a ioctl that does this with EREMOVE.  Userspace can
> invoke the ioctl to bring its vEPC pages back to uninitialized state.
> There is a possibility that some pages fail to be removed if they are
> SECS pages, so the ioctl returns the number of EREMOVE failures.  The
> correct usage is documented in sgx.rst.
> 
> Tested-by: Yang Zhong <yang.zhong@intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  Documentation/x86/sgx.rst       | 14 +++++++++++++
>  arch/x86/include/uapi/asm/sgx.h |  2 ++
>  arch/x86/kernel/cpu/sgx/virt.c  | 36 +++++++++++++++++++++++++++++++++
>  3 files changed, 52 insertions(+)
> 
> diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
> index dd0ac96ff9ef..b855db96c6c6 100644
> --- a/Documentation/x86/sgx.rst
> +++ b/Documentation/x86/sgx.rst
> @@ -250,3 +250,17 @@ user wants to deploy SGX applications both on the host and in guests
>  on the same machine, the user should reserve enough EPC (by taking out
>  total virtual EPC size of all SGX VMs from the physical EPC size) for
>  host SGX applications so they can run with acceptable performance.
> +
> +Some guests, such as Windows, require that all pages are uninitialized
> +at startup or after a guest reboot.  

I would say this is not requirement of some windows guests, but requirement of
correctly emulating architectural behaviour.  On bare-metal EPC is lost on
reboot, so on VM reboot, virtual EPC should be reset too, regardless what
guest VM is.  

Because this state can be reached
> +only through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc``
> +provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction on
> +all pages in the virtual EPC.
> +
> +The ioctl will often not able to remove SECS pages, in case their child
> +pages have not gone through ``EREMOVE`` yet; therefore, the ioctl returns the
> +number of pages that failed to be removed.  ``SGX_IOC_VEPC_REMOVE_ALL`` should
> +first be invoked on all the ``/dev/sgx_vepc`` file descriptors mapped
> +into the guest; a second call to the ioctl will be able to remove all
> +leftover pages and will return 0.  Any other return value on the second call
> +would be a symptom of a bug in either Linux or the userspace client.

Maybe also worth to mention userspace should guarantee there's no vcpu running
inside guest enclave when resetting guest's virtual EPC.

> diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
> index 9690d6899ad9..f79d84ce8033 100644
> --- a/arch/x86/include/uapi/asm/sgx.h
> +++ b/arch/x86/include/uapi/asm/sgx.h
> @@ -27,6 +27,8 @@ enum sgx_page_flags {
>  	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
>  #define SGX_IOC_ENCLAVE_PROVISION \
>  	_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
> +#define SGX_IOC_VEPC_REMOVE_ALL \
> +	_IO(SGX_MAGIC, 0x04)
>  
>  /**
>   * struct sgx_enclave_create - parameter structure for the
> diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
> index 59b9c13121cd..81dc186fda2e 100644
> --- a/arch/x86/kernel/cpu/sgx/virt.c
> +++ b/arch/x86/kernel/cpu/sgx/virt.c
> @@ -154,6 +154,24 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
>  	return 0;
>  }
>  
> +static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
> +{
> +	struct sgx_epc_page *entry;
> +	unsigned long index;
> +	long failures = 0;
> +
> +	xa_for_each(&vepc->page_array, index, entry)
> +		if (sgx_vepc_remove_page(entry))
> +			failures++;

I think need to hold vepc->lock?

> +
> +	/*
> +	 * Return the number of pages that failed to be removed, so
> +	 * userspace knows that there are still SECS pages lying
> +	 * around.
> +	 */
> +	return failures;
> +}
> +
>  static int sgx_vepc_release(struct inode *inode, struct file *file)
>  {
>  	struct sgx_vepc *vepc = file->private_data;
> @@ -239,9 +257,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file)
>  	return 0;
>  }
>  
> +static long sgx_vepc_ioctl(struct file *file,
> +			   unsigned int cmd, unsigned long arg)
> +{
> +	struct sgx_vepc *vepc = file->private_data;
> +
> +	switch (cmd) {
> +	case SGX_IOC_VEPC_REMOVE_ALL:
> +		if (arg)
> +			return -EINVAL;
> +		return sgx_vepc_remove_all(vepc);
> +
> +	default:
> +		return -ENOTTY;
> +	}
> +}
> +
>  static const struct file_operations sgx_vepc_fops = {
>  	.owner		= THIS_MODULE,
>  	.open		= sgx_vepc_open,
> +	.unlocked_ioctl	= sgx_vepc_ioctl,
> +	.compat_ioctl	= sgx_vepc_ioctl,
>  	.release	= sgx_vepc_release,
>  	.mmap		= sgx_vepc_mmap,
>  };
> -- 
> 2.27.0
>
Dave Hansen Sept. 20, 2021, 11:09 p.m. UTC | #2
On 9/20/21 3:17 PM, Kai Huang wrote:
>> +The ioctl will often not able to remove SECS pages, in case their child
>> +pages have not gone through ``EREMOVE`` yet; therefore, the ioctl returns the
>> +number of pages that failed to be removed.  ``SGX_IOC_VEPC_REMOVE_ALL`` should
>> +first be invoked on all the ``/dev/sgx_vepc`` file descriptors mapped
>> +into the guest; a second call to the ioctl will be able to remove all
>> +leftover pages and will return 0.  Any other return value on the second call
>> +would be a symptom of a bug in either Linux or the userspace client.
> Maybe also worth to mention userspace should guarantee there's no vcpu running
> inside guest enclave when resetting guest's virtual EPC.

Why, specifically?

Is it because EREMOVE will also fail if there is a CPU running in the
enclave?

Maybe we should say something like:

The ioctl() returns the number of pages where removal failed.  Callers
of the ioctl() need to handle two sources of failure:

1) Page removal will always fail when any thread is running in the
   enclave to which the page belongs.  ioctl() users can avoid these
   failures by preventing execution of any vcpu which maps the virtual
   EPC.

2) Page removal will also fail for SGX "SECS" metadata pages which still
   have child pages.  SECS pages can be removed by executing
   ``SGX_IOC_VEPC_REMOVE_ALL`` on all ``/dev/sgx_vepc`` file descriptors
   mapped into the guest.  Yhis means that the ioctl() must be called
   twice: an initial set of calls to remove child pages and a subsequent
   set of calls to remove SECS pages.

   It indicates a bug in the kernel or the userspace client if any of
   the second round of ``SGX_IOC_VEPC_REMOVE_ALL`` calls fails (has a
   return code other than 0).
Paolo Bonzini Sept. 21, 2021, 10:29 a.m. UTC | #3
On 21/09/21 01:09, Dave Hansen wrote:
>> Maybe also worth to mention userspace should guarantee there's no vcpu running
>> inside guest enclave when resetting guest's virtual EPC.
> Why, specifically?
> 
> Is it because EREMOVE will also fail if there is a CPU running in the
> enclave?

Yes, and SGX_ENCLAVE_ACT would cause a WARN.  Good catch, Kai, I'll fix it.

Paolo
diff mbox series

Patch

diff --git a/Documentation/x86/sgx.rst b/Documentation/x86/sgx.rst
index dd0ac96ff9ef..b855db96c6c6 100644
--- a/Documentation/x86/sgx.rst
+++ b/Documentation/x86/sgx.rst
@@ -250,3 +250,17 @@  user wants to deploy SGX applications both on the host and in guests
 on the same machine, the user should reserve enough EPC (by taking out
 total virtual EPC size of all SGX VMs from the physical EPC size) for
 host SGX applications so they can run with acceptable performance.
+
+Some guests, such as Windows, require that all pages are uninitialized
+at startup or after a guest reboot.  Because this state can be reached
+only through the privileged ``ENCLS[EREMOVE]`` instruction, ``/dev/sgx_vepc``
+provides the ``SGX_IOC_VEPC_REMOVE_ALL`` ioctl to execute the instruction on
+all pages in the virtual EPC.
+
+The ioctl will often not able to remove SECS pages, in case their child
+pages have not gone through ``EREMOVE`` yet; therefore, the ioctl returns the
+number of pages that failed to be removed.  ``SGX_IOC_VEPC_REMOVE_ALL`` should
+first be invoked on all the ``/dev/sgx_vepc`` file descriptors mapped
+into the guest; a second call to the ioctl will be able to remove all
+leftover pages and will return 0.  Any other return value on the second call
+would be a symptom of a bug in either Linux or the userspace client.
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 9690d6899ad9..f79d84ce8033 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -27,6 +27,8 @@  enum sgx_page_flags {
 	_IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init)
 #define SGX_IOC_ENCLAVE_PROVISION \
 	_IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision)
+#define SGX_IOC_VEPC_REMOVE_ALL \
+	_IO(SGX_MAGIC, 0x04)
 
 /**
  * struct sgx_enclave_create - parameter structure for the
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 59b9c13121cd..81dc186fda2e 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -154,6 +154,24 @@  static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
 	return 0;
 }
 
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+	struct sgx_epc_page *entry;
+	unsigned long index;
+	long failures = 0;
+
+	xa_for_each(&vepc->page_array, index, entry)
+		if (sgx_vepc_remove_page(entry))
+			failures++;
+
+	/*
+	 * Return the number of pages that failed to be removed, so
+	 * userspace knows that there are still SECS pages lying
+	 * around.
+	 */
+	return failures;
+}
+
 static int sgx_vepc_release(struct inode *inode, struct file *file)
 {
 	struct sgx_vepc *vepc = file->private_data;
@@ -239,9 +257,27 @@  static int sgx_vepc_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static long sgx_vepc_ioctl(struct file *file,
+			   unsigned int cmd, unsigned long arg)
+{
+	struct sgx_vepc *vepc = file->private_data;
+
+	switch (cmd) {
+	case SGX_IOC_VEPC_REMOVE_ALL:
+		if (arg)
+			return -EINVAL;
+		return sgx_vepc_remove_all(vepc);
+
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct file_operations sgx_vepc_fops = {
 	.owner		= THIS_MODULE,
 	.open		= sgx_vepc_open,
+	.unlocked_ioctl	= sgx_vepc_ioctl,
+	.compat_ioctl	= sgx_vepc_ioctl,
 	.release	= sgx_vepc_release,
 	.mmap		= sgx_vepc_mmap,
 };