diff mbox series

[v39,11/24] x86/sgx: Add SGX enclave driver

Message ID 20201003045059.665934-12-jarkko.sakkinen@linux.intel.com (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Jarkko Sakkinen Oct. 3, 2020, 4:50 a.m. UTC
Intel Software Guard eXtensions (SGX) is a set of CPU instructions that can
be used by applications to set aside private regions of code and data. The
code outside the SGX hosted software entity is prevented from accessing the
memory inside the enclave by the CPU. We call these entities enclaves.

Add a driver that provides an ioctl API to construct and run enclaves.
Enclaves are constructed from pages residing in reserved physical memory
areas. The contents of these pages can only be accessed when they are
mapped as part of an enclave, by a hardware thread running inside the
enclave.

The starting state of an enclave consists of a fixed measured set of
pages that are copied to the EPC during the construction process by
using the opcode ENCLS leaf functions and Software Enclave Control
Structure (SECS) that defines the enclave properties.

Enclaves are constructed by using ENCLS leaf functions ECREATE, EADD and
EINIT. ECREATE initializes SECS, EADD copies pages from system memory to
the EPC and EINIT checks a given signed measurement and moves the enclave
into a state ready for execution.

An initialized enclave can only be accessed through special Thread Control
Structure (TCS) pages by using ENCLU (ring-3 only) leaf EENTER.  This leaf
function converts a thread into enclave mode and continues the execution in
the offset defined by the TCS provided to EENTER. An enclave is exited
through syscall, exception, interrupts or by explicitly calling another
ENCLU leaf EEXIT.

The mmap() permissions are capped by the contained enclave page
permissions. The mapped areas must also be populated, i.e. each page
address must contain a page. This logic is implemented in
sgx_encl_may_map().

Cc: linux-security-module@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Acked-by: Jethro Beekman <jethro@fortanix.com>
Tested-by: Jethro Beekman <jethro@fortanix.com>
Tested-by: Haitao Huang <haitao.huang@linux.intel.com>
Tested-by: Chunyang Hui <sanqian.hcy@antfin.com>
Tested-by: Jordan Hand <jorhand@linux.microsoft.com>
Tested-by: Nathaniel McCallum <npmccallum@redhat.com>
Tested-by: Seth Moore <sethmo@google.com>
Tested-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/x86/kernel/cpu/sgx/Makefile |   2 +
 arch/x86/kernel/cpu/sgx/driver.c | 173 ++++++++++++++++
 arch/x86/kernel/cpu/sgx/driver.h |  29 +++
 arch/x86/kernel/cpu/sgx/encl.c   | 331 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/sgx/encl.h   |  85 ++++++++
 arch/x86/kernel/cpu/sgx/main.c   |  11 +
 6 files changed, 631 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/sgx/driver.c
 create mode 100644 arch/x86/kernel/cpu/sgx/driver.h
 create mode 100644 arch/x86/kernel/cpu/sgx/encl.c
 create mode 100644 arch/x86/kernel/cpu/sgx/encl.h

Comments

Greg KH Oct. 3, 2020, 2:39 p.m. UTC | #1
On Sat, Oct 03, 2020 at 07:50:46AM +0300, Jarkko Sakkinen wrote:
> Intel Software Guard eXtensions (SGX) is a set of CPU instructions that can
> be used by applications to set aside private regions of code and data. The
> code outside the SGX hosted software entity is prevented from accessing the
> memory inside the enclave by the CPU. We call these entities enclaves.
> 
> Add a driver that provides an ioctl API to construct and run enclaves.
> Enclaves are constructed from pages residing in reserved physical memory
> areas. The contents of these pages can only be accessed when they are
> mapped as part of an enclave, by a hardware thread running inside the
> enclave.
> 
> The starting state of an enclave consists of a fixed measured set of
> pages that are copied to the EPC during the construction process by
> using the opcode ENCLS leaf functions and Software Enclave Control
> Structure (SECS) that defines the enclave properties.
> 
> Enclaves are constructed by using ENCLS leaf functions ECREATE, EADD and
> EINIT. ECREATE initializes SECS, EADD copies pages from system memory to
> the EPC and EINIT checks a given signed measurement and moves the enclave
> into a state ready for execution.
> 
> An initialized enclave can only be accessed through special Thread Control
> Structure (TCS) pages by using ENCLU (ring-3 only) leaf EENTER.  This leaf
> function converts a thread into enclave mode and continues the execution in
> the offset defined by the TCS provided to EENTER. An enclave is exited
> through syscall, exception, interrupts or by explicitly calling another
> ENCLU leaf EEXIT.
> 
> The mmap() permissions are capped by the contained enclave page
> permissions. The mapped areas must also be populated, i.e. each page
> address must contain a page. This logic is implemented in
> sgx_encl_may_map().
> 
> Cc: linux-security-module@vger.kernel.org
> Cc: linux-mm@kvack.org
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Matthew Wilcox <willy@infradead.org>
> Acked-by: Jethro Beekman <jethro@fortanix.com>
> Tested-by: Jethro Beekman <jethro@fortanix.com>
> Tested-by: Haitao Huang <haitao.huang@linux.intel.com>
> Tested-by: Chunyang Hui <sanqian.hcy@antfin.com>
> Tested-by: Jordan Hand <jorhand@linux.microsoft.com>
> Tested-by: Nathaniel McCallum <npmccallum@redhat.com>
> Tested-by: Seth Moore <sethmo@google.com>
> Tested-by: Darren Kenny <darren.kenny@oracle.com>
> Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---
>  arch/x86/kernel/cpu/sgx/Makefile |   2 +
>  arch/x86/kernel/cpu/sgx/driver.c | 173 ++++++++++++++++
>  arch/x86/kernel/cpu/sgx/driver.h |  29 +++
>  arch/x86/kernel/cpu/sgx/encl.c   | 331 +++++++++++++++++++++++++++++++
>  arch/x86/kernel/cpu/sgx/encl.h   |  85 ++++++++
>  arch/x86/kernel/cpu/sgx/main.c   |  11 +
>  6 files changed, 631 insertions(+)
>  create mode 100644 arch/x86/kernel/cpu/sgx/driver.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/driver.h
>  create mode 100644 arch/x86/kernel/cpu/sgx/encl.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/encl.h
> 
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> index 79510ce01b3b..3fc451120735 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -1,2 +1,4 @@
>  obj-y += \
> +	driver.o \
> +	encl.o \
>  	main.o
> diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> new file mode 100644
> index 000000000000..f54da5f19c2b
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/driver.c
> @@ -0,0 +1,173 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)

You use gpl-only header files in this file, so how in the world can it
be bsd-3 licensed?

Please get your legal department to agree with this, after you explain
to them how you are mixing gpl2-only code in with this file.

> +// Copyright(c) 2016-18 Intel Corporation.

Dates are hard to get right :(

> +
> +#include <linux/acpi.h>
> +#include <linux/miscdevice.h>
> +#include <linux/mman.h>
> +#include <linux/security.h>
> +#include <linux/suspend.h>
> +#include <asm/traps.h>
> +#include "driver.h"
> +#include "encl.h"
> +
> +u64 sgx_encl_size_max_32;
> +u64 sgx_encl_size_max_64;
> +u32 sgx_misc_reserved_mask;
> +u64 sgx_attributes_reserved_mask;
> +u64 sgx_xfrm_reserved_mask = ~0x3;
> +u32 sgx_xsave_size_tbl[64];
> +
> +static int sgx_open(struct inode *inode, struct file *file)
> +{
> +	struct sgx_encl *encl;
> +	int ret;
> +
> +	encl = kzalloc(sizeof(*encl), GFP_KERNEL);
> +	if (!encl)
> +		return -ENOMEM;
> +
> +	atomic_set(&encl->flags, 0);
> +	kref_init(&encl->refcount);
> +	xa_init(&encl->page_array);
> +	mutex_init(&encl->lock);
> +	INIT_LIST_HEAD(&encl->mm_list);
> +	spin_lock_init(&encl->mm_lock);
> +
> +	ret = init_srcu_struct(&encl->srcu);
> +	if (ret) {
> +		kfree(encl);
> +		return ret;
> +	}
> +
> +	file->private_data = encl;
> +
> +	return 0;
> +}
> +
> +static int sgx_release(struct inode *inode, struct file *file)
> +{
> +	struct sgx_encl *encl = file->private_data;
> +	struct sgx_encl_mm *encl_mm;
> +
> +	for ( ; ; )  {
> +		spin_lock(&encl->mm_lock);
> +
> +		if (list_empty(&encl->mm_list)) {
> +			encl_mm = NULL;
> +		} else {
> +			encl_mm = list_first_entry(&encl->mm_list,
> +						   struct sgx_encl_mm, list);
> +			list_del_rcu(&encl_mm->list);
> +		}
> +
> +		spin_unlock(&encl->mm_lock);
> +
> +		/* The list is empty, ready to go. */
> +		if (!encl_mm)
> +			break;
> +
> +		synchronize_srcu(&encl->srcu);
> +		mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
> +		kfree(encl_mm);
> +	}
> +
> +	mutex_lock(&encl->lock);
> +	atomic_or(SGX_ENCL_DEAD, &encl->flags);

So you set a flag that this is dead, and then instantly delete it?  Why
does that matter?  I see you check for this flag elsewhere, but as you
are just about to delete this structure, how can this be an issue?

> +	mutex_unlock(&encl->lock);
> +
> +	kref_put(&encl->refcount, sgx_encl_release);

Don't you need to hold the lock across the put?  If not, what is
serializing this?

But an even larger comment, why is this reference count needed at all?

You never grab it except at init time, and you free it at close time.
Why not rely on the reference counting that the vfs ensures you?



> +	return 0;
> +}
> +
> +static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct sgx_encl *encl = file->private_data;
> +	int ret;
> +
> +	ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
> +	if (ret)
> +		return ret;
> +
> +	ret = sgx_encl_mm_add(encl, vma->vm_mm);
> +	if (ret)
> +		return ret;
> +
> +	vma->vm_ops = &sgx_vm_ops;
> +	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> +	vma->vm_private_data = encl;
> +
> +	return 0;
> +}
> +
> +static unsigned long sgx_get_unmapped_area(struct file *file,
> +					   unsigned long addr,
> +					   unsigned long len,
> +					   unsigned long pgoff,
> +					   unsigned long flags)
> +{
> +	if ((flags & MAP_TYPE) == MAP_PRIVATE)
> +		return -EINVAL;
> +
> +	if (flags & MAP_FIXED)
> +		return addr;
> +
> +	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> +}
> +
> +static const struct file_operations sgx_encl_fops = {
> +	.owner			= THIS_MODULE,
> +	.open			= sgx_open,
> +	.release		= sgx_release,
> +	.mmap			= sgx_mmap,
> +	.get_unmapped_area	= sgx_get_unmapped_area,
> +};
> +
> +static struct miscdevice sgx_dev_enclave = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "enclave",
> +	.nodename = "sgx/enclave",

A subdir for a single device node?  Ok, odd, but why not just
"sgx_enclave"?  How "special" is this device node?

thanks,

greg k-h
Matthew Wilcox Oct. 3, 2020, 7:54 p.m. UTC | #2
On Sat, Oct 03, 2020 at 07:50:46AM +0300, Jarkko Sakkinen wrote:
> +	XA_STATE(xas, &encl->page_array, idx_start);
> +
> +	/*
> +	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
> +	 * conflict with the enclave page permissions.
> +	 */
> +	if (current->personality & READ_IMPLIES_EXEC)
> +		return -EACCES;
> +
> +	xas_for_each(&xas, page, idx_end)
> +		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> +			return -EACCES;

You're iterating the array without holding any lock that the XArray knows
about.  If you're OK with another thread adding/removing pages behind your
back, or there's a higher level lock (the mmap_sem?) protecting the XArray
from being modified while you walk it, then hold the rcu_read_lock()
while walking the array.  Otherwise you can prevent modification by
calling xas_lock(&xas) and xas_unlock()..

> +	return 0;
> +}
> +
> +static int sgx_vma_mprotect(struct vm_area_struct *vma,
> +			    struct vm_area_struct **pprev, unsigned long start,
> +			    unsigned long end, unsigned long newflags)
> +{
> +	int ret;
> +
> +	ret = sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
> +	if (ret)
> +		return ret;
> +
> +	return mprotect_fixup(vma, pprev, start, end, newflags);
> +}
> +
> +const struct vm_operations_struct sgx_vm_ops = {
> +	.open = sgx_vma_open,
> +	.fault = sgx_vma_fault,
> +	.mprotect = sgx_vma_mprotect,
> +};
> +
> +/**
> + * sgx_encl_find - find an enclave
> + * @mm:		mm struct of the current process
> + * @addr:	address in the ELRANGE
> + * @vma:	the resulting VMA
> + *
> + * Find an enclave identified by the given address. Give back a VMA that is
> + * part of the enclave and located in that address. The VMA is given back if it
> + * is a proper enclave VMA even if an &sgx_encl instance does not exist yet
> + * (enclave creation has not been performed).
> + *
> + * Return:
> + *   0 on success,
> + *   -EINVAL if an enclave was not found,
> + *   -ENOENT if the enclave has not been created yet
> + */
> +int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
> +		  struct vm_area_struct **vma)
> +{
> +	struct vm_area_struct *result;
> +	struct sgx_encl *encl;
> +
> +	result = find_vma(mm, addr);
> +	if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
> +		return -EINVAL;
> +
> +	encl = result->vm_private_data;
> +	*vma = result;
> +
> +	return encl ? 0 : -ENOENT;
> +}
> +
> +/**
> + * sgx_encl_destroy() - destroy enclave resources
> + * @encl:	an enclave pointer
> + */
> +void sgx_encl_destroy(struct sgx_encl *encl)
> +{
> +	struct sgx_encl_page *entry;
> +	unsigned long index;
> +
> +	atomic_or(SGX_ENCL_DEAD, &encl->flags);
> +
> +	xa_for_each(&encl->page_array, index, entry) {
> +		if (entry->epc_page) {
> +			sgx_free_epc_page(entry->epc_page);
> +			encl->secs_child_cnt--;
> +			entry->epc_page = NULL;
> +		}
> +
> +		kfree(entry);
> +	}
> +
> +	xa_destroy(&encl->page_array);
> +
> +	if (!encl->secs_child_cnt && encl->secs.epc_page) {
> +		sgx_free_epc_page(encl->secs.epc_page);
> +		encl->secs.epc_page = NULL;
> +	}
> +}
> +
> +/**
> + * sgx_encl_release - Destroy an enclave instance
> + * @kref:	address of a kref inside &sgx_encl
> + *
> + * Used together with kref_put(). Frees all the resources associated with the
> + * enclave and the instance itself.
> + */
> +void sgx_encl_release(struct kref *ref)
> +{
> +	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
> +
> +	sgx_encl_destroy(encl);
> +
> +	if (encl->backing)
> +		fput(encl->backing);
> +
> +	cleanup_srcu_struct(&encl->srcu);
> +
> +	WARN_ON_ONCE(!list_empty(&encl->mm_list));
> +
> +	/* Detect EPC page leak's. */
> +	WARN_ON_ONCE(encl->secs_child_cnt);
> +	WARN_ON_ONCE(encl->secs.epc_page);
> +
> +	kfree(encl);
> +}
> diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
> new file mode 100644
> index 000000000000..8ff445476657
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/sgx/encl.h
> @@ -0,0 +1,85 @@
> +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
> +/**
> + * Copyright(c) 2016-19 Intel Corporation.
> + */
> +#ifndef _X86_ENCL_H
> +#define _X86_ENCL_H
> +
> +#include <linux/cpumask.h>
> +#include <linux/kref.h>
> +#include <linux/list.h>
> +#include <linux/mm_types.h>
> +#include <linux/mmu_notifier.h>
> +#include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <linux/srcu.h>
> +#include <linux/workqueue.h>
> +#include <linux/xarray.h>
> +#include "sgx.h"
> +
> +/**
> + * enum sgx_encl_page_desc - defines bits for an enclave page's descriptor
> + * %SGX_ENCL_PAGE_ADDR_MASK:		Holds the virtual address of the page.
> + *
> + * The page address for SECS is zero and is used by the subsystem to recognize
> + * the SECS page.
> + */
> +enum sgx_encl_page_desc {
> +	/* Bits 11:3 are available when the page is not swapped. */
> +	SGX_ENCL_PAGE_ADDR_MASK		= PAGE_MASK,
> +};
> +
> +#define SGX_ENCL_PAGE_ADDR(page) \
> +	((page)->desc & SGX_ENCL_PAGE_ADDR_MASK)
> +
> +struct sgx_encl_page {
> +	unsigned long desc;
> +	unsigned long vm_max_prot_bits;
> +	struct sgx_epc_page *epc_page;
> +	struct sgx_encl *encl;
> +};
> +
> +enum sgx_encl_flags {
> +	SGX_ENCL_CREATED	= BIT(0),
> +	SGX_ENCL_INITIALIZED	= BIT(1),
> +	SGX_ENCL_DEBUG		= BIT(2),
> +	SGX_ENCL_DEAD		= BIT(3),
> +	SGX_ENCL_IOCTL		= BIT(4),
> +};
> +
> +struct sgx_encl_mm {
> +	struct sgx_encl *encl;
> +	struct mm_struct *mm;
> +	struct list_head list;
> +	struct mmu_notifier mmu_notifier;
> +};
> +
> +struct sgx_encl {
> +	atomic_t flags;
> +	unsigned int page_cnt;
> +	unsigned int secs_child_cnt;
> +	struct mutex lock;
> +	struct list_head mm_list;
> +	spinlock_t mm_lock;
> +	struct file *backing;
> +	struct kref refcount;
> +	struct srcu_struct srcu;
> +	unsigned long base;
> +	unsigned long size;
> +	unsigned long ssaframesize;
> +	struct xarray page_array;
> +	struct sgx_encl_page secs;
> +	cpumask_t cpumask;
> +};
> +
> +extern const struct vm_operations_struct sgx_vm_ops;
> +
> +int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
> +		  struct vm_area_struct **vma);
> +void sgx_encl_destroy(struct sgx_encl *encl);
> +void sgx_encl_release(struct kref *ref);
> +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
> +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
> +		     unsigned long end, unsigned long vm_flags);
> +
> +#endif /* _X86_ENCL_H */
> diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
> index 97c6895fb6c9..4137254fb29e 100644
> --- a/arch/x86/kernel/cpu/sgx/main.c
> +++ b/arch/x86/kernel/cpu/sgx/main.c
> @@ -9,6 +9,8 @@
>  #include <linux/sched/mm.h>
>  #include <linux/sched/signal.h>
>  #include <linux/slab.h>
> +#include "driver.h"
> +#include "encl.h"
>  #include "encls.h"
>  
>  struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
> @@ -260,6 +262,8 @@ static bool __init sgx_page_cache_init(void)
>  
>  static void __init sgx_init(void)
>  {
> +	int ret;
> +
>  	if (!boot_cpu_has(X86_FEATURE_SGX))
>  		return;
>  
> @@ -269,8 +273,15 @@ static void __init sgx_init(void)
>  	if (!sgx_page_reclaimer_init())
>  		goto err_page_cache;
>  
> +	ret = sgx_drv_init();
> +	if (ret)
> +		goto err_kthread;
> +
>  	return;
>  
> +err_kthread:
> +	kthread_stop(ksgxswapd_tsk);
> +
>  err_page_cache:
>  	sgx_page_cache_teardown();
>  }
> -- 
> 2.25.1
>
Jarkko Sakkinen Oct. 4, 2020, 2:32 p.m. UTC | #3
On Sat, Oct 03, 2020 at 04:39:25PM +0200, Greg KH wrote:
> On Sat, Oct 03, 2020 at 07:50:46AM +0300, Jarkko Sakkinen wrote:
> > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that can
> > be used by applications to set aside private regions of code and data. The
> > code outside the SGX hosted software entity is prevented from accessing the
> > memory inside the enclave by the CPU. We call these entities enclaves.
> > 
> > Add a driver that provides an ioctl API to construct and run enclaves.
> > Enclaves are constructed from pages residing in reserved physical memory
> > areas. The contents of these pages can only be accessed when they are
> > mapped as part of an enclave, by a hardware thread running inside the
> > enclave.
> > 
> > The starting state of an enclave consists of a fixed measured set of
> > pages that are copied to the EPC during the construction process by
> > using the opcode ENCLS leaf functions and Software Enclave Control
> > Structure (SECS) that defines the enclave properties.
> > 
> > Enclaves are constructed by using ENCLS leaf functions ECREATE, EADD and
> > EINIT. ECREATE initializes SECS, EADD copies pages from system memory to
> > the EPC and EINIT checks a given signed measurement and moves the enclave
> > into a state ready for execution.
> > 
> > An initialized enclave can only be accessed through special Thread Control
> > Structure (TCS) pages by using ENCLU (ring-3 only) leaf EENTER.  This leaf
> > function converts a thread into enclave mode and continues the execution in
> > the offset defined by the TCS provided to EENTER. An enclave is exited
> > through syscall, exception, interrupts or by explicitly calling another
> > ENCLU leaf EEXIT.
> > 
> > The mmap() permissions are capped by the contained enclave page
> > permissions. The mapped areas must also be populated, i.e. each page
> > address must contain a page. This logic is implemented in
> > sgx_encl_may_map().
> > 
> > Cc: linux-security-module@vger.kernel.org
> > Cc: linux-mm@kvack.org
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Matthew Wilcox <willy@infradead.org>
> > Acked-by: Jethro Beekman <jethro@fortanix.com>
> > Tested-by: Jethro Beekman <jethro@fortanix.com>
> > Tested-by: Haitao Huang <haitao.huang@linux.intel.com>
> > Tested-by: Chunyang Hui <sanqian.hcy@antfin.com>
> > Tested-by: Jordan Hand <jorhand@linux.microsoft.com>
> > Tested-by: Nathaniel McCallum <npmccallum@redhat.com>
> > Tested-by: Seth Moore <sethmo@google.com>
> > Tested-by: Darren Kenny <darren.kenny@oracle.com>
> > Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
> > Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
> > Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > ---
> >  arch/x86/kernel/cpu/sgx/Makefile |   2 +
> >  arch/x86/kernel/cpu/sgx/driver.c | 173 ++++++++++++++++
> >  arch/x86/kernel/cpu/sgx/driver.h |  29 +++
> >  arch/x86/kernel/cpu/sgx/encl.c   | 331 +++++++++++++++++++++++++++++++
> >  arch/x86/kernel/cpu/sgx/encl.h   |  85 ++++++++
> >  arch/x86/kernel/cpu/sgx/main.c   |  11 +
> >  6 files changed, 631 insertions(+)
> >  create mode 100644 arch/x86/kernel/cpu/sgx/driver.c
> >  create mode 100644 arch/x86/kernel/cpu/sgx/driver.h
> >  create mode 100644 arch/x86/kernel/cpu/sgx/encl.c
> >  create mode 100644 arch/x86/kernel/cpu/sgx/encl.h
> > 
> > diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> > index 79510ce01b3b..3fc451120735 100644
> > --- a/arch/x86/kernel/cpu/sgx/Makefile
> > +++ b/arch/x86/kernel/cpu/sgx/Makefile
> > @@ -1,2 +1,4 @@
> >  obj-y += \
> > +	driver.o \
> > +	encl.o \
> >  	main.o
> > diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> > new file mode 100644
> > index 000000000000..f54da5f19c2b
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/driver.c
> > @@ -0,0 +1,173 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> 
> You use gpl-only header files in this file, so how in the world can it
> be bsd-3 licensed?
> 
> Please get your legal department to agree with this, after you explain
> to them how you are mixing gpl2-only code in with this file.

I'll do what I already stated that I will do. Should I do something
more?

> > +// Copyright(c) 2016-18 Intel Corporation.
> 
> Dates are hard to get right :(

Will fix.

> 
> > +
> > +#include <linux/acpi.h>
> > +#include <linux/miscdevice.h>
> > +#include <linux/mman.h>
> > +#include <linux/security.h>
> > +#include <linux/suspend.h>
> > +#include <asm/traps.h>
> > +#include "driver.h"
> > +#include "encl.h"
> > +
> > +u64 sgx_encl_size_max_32;
> > +u64 sgx_encl_size_max_64;
> > +u32 sgx_misc_reserved_mask;
> > +u64 sgx_attributes_reserved_mask;
> > +u64 sgx_xfrm_reserved_mask = ~0x3;
> > +u32 sgx_xsave_size_tbl[64];
> > +
> > +static int sgx_open(struct inode *inode, struct file *file)
> > +{
> > +	struct sgx_encl *encl;
> > +	int ret;
> > +
> > +	encl = kzalloc(sizeof(*encl), GFP_KERNEL);
> > +	if (!encl)
> > +		return -ENOMEM;
> > +
> > +	atomic_set(&encl->flags, 0);
> > +	kref_init(&encl->refcount);
> > +	xa_init(&encl->page_array);
> > +	mutex_init(&encl->lock);
> > +	INIT_LIST_HEAD(&encl->mm_list);
> > +	spin_lock_init(&encl->mm_lock);
> > +
> > +	ret = init_srcu_struct(&encl->srcu);
> > +	if (ret) {
> > +		kfree(encl);
> > +		return ret;
> > +	}
> > +
> > +	file->private_data = encl;
> > +
> > +	return 0;
> > +}
> > +
> > +static int sgx_release(struct inode *inode, struct file *file)
> > +{
> > +	struct sgx_encl *encl = file->private_data;
> > +	struct sgx_encl_mm *encl_mm;
> > +
> > +	for ( ; ; )  {
> > +		spin_lock(&encl->mm_lock);
> > +
> > +		if (list_empty(&encl->mm_list)) {
> > +			encl_mm = NULL;
> > +		} else {
> > +			encl_mm = list_first_entry(&encl->mm_list,
> > +						   struct sgx_encl_mm, list);
> > +			list_del_rcu(&encl_mm->list);
> > +		}
> > +
> > +		spin_unlock(&encl->mm_lock);
> > +
> > +		/* The list is empty, ready to go. */
> > +		if (!encl_mm)
> > +			break;
> > +
> > +		synchronize_srcu(&encl->srcu);
> > +		mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
> > +		kfree(encl_mm);
> > +	}
> > +
> > +	mutex_lock(&encl->lock);
> > +	atomic_or(SGX_ENCL_DEAD, &encl->flags);
> 
> So you set a flag that this is dead, and then instantly delete it?  Why
> does that matter?  I see you check for this flag elsewhere, but as you
> are just about to delete this structure, how can this be an issue?

It matters because ksgxswapd (sgx_reclaimer_*) might be processing it.

It will use the flag to skip the operations that it would do to a victim
page, when the enclave is still alive.

> 
> > +	mutex_unlock(&encl->lock);
> > +
> > +	kref_put(&encl->refcount, sgx_encl_release);
> 
> Don't you need to hold the lock across the put?  If not, what is
> serializing this?
> 
> But an even larger comment, why is this reference count needed at all?
> 
> You never grab it except at init time, and you free it at close time.
> Why not rely on the reference counting that the vfs ensures you?

Because ksgxswapd needs the alive enclave instance while it is in the
process of swapping a victim page. The reason for this is the
hierarchical nature of the enclave pages.

As an example, a write operation to main memory, EWB (SDM vol 3D 40-79),
needs to access the SGX Enclave Control Structure (SECS) page, which
contains global data for an enclave, like the unswapped child count.


> > +	return 0;
> > +}
> > +
> > +static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	struct sgx_encl *encl = file->private_data;
> > +	int ret;
> > +
> > +	ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = sgx_encl_mm_add(encl, vma->vm_mm);
> > +	if (ret)
> > +		return ret;
> > +
> > +	vma->vm_ops = &sgx_vm_ops;
> > +	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> > +	vma->vm_private_data = encl;
> > +
> > +	return 0;
> > +}
> > +
> > +static unsigned long sgx_get_unmapped_area(struct file *file,
> > +					   unsigned long addr,
> > +					   unsigned long len,
> > +					   unsigned long pgoff,
> > +					   unsigned long flags)
> > +{
> > +	if ((flags & MAP_TYPE) == MAP_PRIVATE)
> > +		return -EINVAL;
> > +
> > +	if (flags & MAP_FIXED)
> > +		return addr;
> > +
> > +	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> > +}
> > +
> > +static const struct file_operations sgx_encl_fops = {
> > +	.owner			= THIS_MODULE,
> > +	.open			= sgx_open,
> > +	.release		= sgx_release,
> > +	.mmap			= sgx_mmap,
> > +	.get_unmapped_area	= sgx_get_unmapped_area,
> > +};
> > +
> > +static struct miscdevice sgx_dev_enclave = {
> > +	.minor = MISC_DYNAMIC_MINOR,
> > +	.name = "enclave",
> > +	.nodename = "sgx/enclave",
> 
> A subdir for a single device node?  Ok, odd, but why not just
> "sgx_enclave"?  How "special" is this device node?

There is a patch that adds "sgx/provision".

Either works for me. Should I flatten them to "sgx_enclave" and
"sgx_provision", or keep them as they are?

> thanks,
> 
> greg k-h

/Jarkko
Jarkko Sakkinen Oct. 4, 2020, 3:01 p.m. UTC | #4
On Sun, Oct 04, 2020 at 05:32:57PM +0300, Jarkko Sakkinen wrote:
> On Sat, Oct 03, 2020 at 04:39:25PM +0200, Greg KH wrote:
> > You use gpl-only header files in this file, so how in the world can it
> > be bsd-3 licensed?
> > 
> > Please get your legal department to agree with this, after you explain
> > to them how you are mixing gpl2-only code in with this file.
> 
> I'll do what I already stated that I will do. Should I do something
> more?

And forward this message to the aforementioned entity.

/Jarkko
Jarkko Sakkinen Oct. 4, 2020, 9:50 p.m. UTC | #5
On Sat, Oct 03, 2020 at 08:54:40PM +0100, Matthew Wilcox wrote:
> On Sat, Oct 03, 2020 at 07:50:46AM +0300, Jarkko Sakkinen wrote:
> > +	XA_STATE(xas, &encl->page_array, idx_start);
> > +
> > +	/*
> > +	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
> > +	 * conflict with the enclave page permissions.
> > +	 */
> > +	if (current->personality & READ_IMPLIES_EXEC)
> > +		return -EACCES;
> > +
> > +	xas_for_each(&xas, page, idx_end)
> > +		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> > +			return -EACCES;
> 
> You're iterating the array without holding any lock that the XArray knows
> about.  If you're OK with another thread adding/removing pages behind your
> back, or there's a higher level lock (the mmap_sem?) protecting the XArray
> from being modified while you walk it, then hold the rcu_read_lock()
> while walking the array.  Otherwise you can prevent modification by
> calling xas_lock(&xas) and xas_unlock()..

I backtracked this. The locks were there from v21 to v35. This is a
refactoring mistake from the radix_tree to xarray migration that happened
in v36. It's by no means intentional.

What it should take is encl->lock.

The loop was pre-v36 like:

	idx_start = PFN_DOWN(start);
	idx_end = PFN_DOWN(end - 1);

	for (idx = idx_start; idx <= idx_end; ++idx) {
		mutex_lock(&encl->lock);
		page = radix_tree_lookup(&encl->page_tree, idx);
		mutex_unlock(&encl->lock);

		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
			return -EACCES;
	}

Looking at xarray.h and filemap.c, I'm thinking something along the
lines of:

	for (idx = idx_start; idx <= idx_end; ++idx) {
		mutex_lock(&encl->lock);
		page = xas_find(&xas, idx + 1);
		mutex_unlock(&encl->lock);

		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
			return -EACCES;
	}

Does this look about right?

/Jarkko
Jarkko Sakkinen Oct. 4, 2020, 10:02 p.m. UTC | #6
On Mon, Oct 05, 2020 at 12:51:00AM +0300, Jarkko Sakkinen wrote:
> On Sat, Oct 03, 2020 at 08:54:40PM +0100, Matthew Wilcox wrote:
> > On Sat, Oct 03, 2020 at 07:50:46AM +0300, Jarkko Sakkinen wrote:
> > > +	XA_STATE(xas, &encl->page_array, idx_start);
> > > +
> > > +	/*
> > > +	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
> > > +	 * conflict with the enclave page permissions.
> > > +	 */
> > > +	if (current->personality & READ_IMPLIES_EXEC)
> > > +		return -EACCES;
> > > +
> > > +	xas_for_each(&xas, page, idx_end)
> > > +		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> > > +			return -EACCES;
> > 
> > You're iterating the array without holding any lock that the XArray knows
> > about.  If you're OK with another thread adding/removing pages behind your
> > back, or there's a higher level lock (the mmap_sem?) protecting the XArray
> > from being modified while you walk it, then hold the rcu_read_lock()
> > while walking the array.  Otherwise you can prevent modification by
> > calling xas_lock(&xas) and xas_unlock()..
> 
> I backtracked this. The locks have been there from v21-v35. This is a
> refactoring mistake in radix_tree to xarray migration happened in v36.
> It's by no means intentional.
> 
> What is shoukd take is encl->lock.
> 
> The loop was pre-v36 like:
> 
> 	idx_start = PFN_DOWN(start);
> 	idx_end = PFN_DOWN(end - 1);
> 
> 	for (idx = idx_start; idx <= idx_end; ++idx) {
> 		mutex_lock(&encl->lock);
> 		page = radix_tree_lookup(&encl->page_tree, idx);
> 		mutex_unlock(&encl->lock);
> 
> 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> 			return -EACCES;
> 	}
> 
> Looking at xarray.h and filemap.c, I'm thinking something along the
> lines of:
> 
> 	for (idx = idx_start; idx <= idx_end; ++idx) {
> 		mutex_lock(&encl->lock);
> 		page = xas_find(&xas, idx + 1);
                                      ~~~~~~~
				      idx

> 		mutex_unlock(&encl->lock);
> 
> 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> 			return -EACCES;
> 	}
> 
> Does this look about right?

/Jarkko
Matthew Wilcox Oct. 4, 2020, 10:27 p.m. UTC | #7
On Mon, Oct 05, 2020 at 12:50:49AM +0300, Jarkko Sakkinen wrote:
> What is shoukd take is encl->lock.
> 
> The loop was pre-v36 like:
> 
> 	idx_start = PFN_DOWN(start);
> 	idx_end = PFN_DOWN(end - 1);
> 
> 	for (idx = idx_start; idx <= idx_end; ++idx) {
> 		mutex_lock(&encl->lock);
> 		page = radix_tree_lookup(&encl->page_tree, idx);
> 		mutex_unlock(&encl->lock);
> 
> 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> 			return -EACCES;
> 	}
> 
> Looking at xarray.h and filemap.c, I'm thinking something along the
> lines of:
> 
> 	for (idx = idx_start; idx <= idx_end; ++idx) {
> 		mutex_lock(&encl->lock);
> 		page = xas_find(&xas, idx + 1);
> 		mutex_unlock(&encl->lock);
> 
> 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> 			return -EACCES;
> 	}
> 
> Does this look about right?

Not really ...

	int ret = 0;

	mutex_lock(&encl->lock);
	rcu_read_lock();
	while (xas.index < idx_end) {
		page = xas_next(&xas);
		if (!page || (~page->vm_max_prot_bits & vm_prot_bits)) {
			ret = -EACCES;
			break;
		}
	}
	rcu_read_unlock();
	mutex_unlock(&encl->lock);

	return ret;

... or you could rework to use the xa_lock instead of encl->lock.
I don't know how feasible that is for you.
Jarkko Sakkinen Oct. 4, 2020, 11:41 p.m. UTC | #8
On Sun, Oct 04, 2020 at 11:27:50PM +0100, Matthew Wilcox wrote:
> On Mon, Oct 05, 2020 at 12:50:49AM +0300, Jarkko Sakkinen wrote:
> > What is shoukd take is encl->lock.
> > 
> > The loop was pre-v36 like:
> > 
> > 	idx_start = PFN_DOWN(start);
> > 	idx_end = PFN_DOWN(end - 1);
> > 
> > 	for (idx = idx_start; idx <= idx_end; ++idx) {
> > 		mutex_lock(&encl->lock);
> > 		page = radix_tree_lookup(&encl->page_tree, idx);
> > 		mutex_unlock(&encl->lock);
> > 
> > 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> > 			return -EACCES;
> > 	}
> > 
> > Looking at xarray.h and filemap.c, I'm thinking something along the
> > lines of:
> > 
> > 	for (idx = idx_start; idx <= idx_end; ++idx) {
> > 		mutex_lock(&encl->lock);
> > 		page = xas_find(&xas, idx + 1);
> > 		mutex_unlock(&encl->lock);
> > 
> > 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> > 			return -EACCES;
> > 	}
> > 
> > Does this look about right?
> 
> Not really ...
> 
> 	int ret = 0;
> 
> 	mutex_lock(&encl->lock);
> 	rcu_read_lock();

Right, so xa_*() take RCU lock implicitly and xas_* do not.

> 	while (xas.index < idx_end) {
> 		page = xas_next(&xas);

It should iterate through every possible page index within the range,
even the ones that do not have an entry, i.e. this loop also checks
that there are no empty slots.

Does xas_next() go through every possible index, or skip the empty
ones?

> 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> 			ret = -EACCESS;
> 			break;
> 		}
> 	}
> 	rcu_read_unlock();
> 	mutex_unlock(&encl->lock);

In my Geminilake NUC the maximum size of the address space is 64GB for
an enclave, and it is not fixed but can grow in microarchitectures
beyond that.

That means that in (*artificial*) worst case the locks would be kept for
64*1024*1024*1024/4096 = 16777216 iterations.

I just realized that in sgx_encl_load_page ([1], the encl->lock is
acquired by the caller) I have used xa_load(), which more or less would
be compatible with the old radix_tree pattern, i.e.

for (idx = idx_start; idx <= idx_end; ++idx) {
	mutex_lock(&encl->lock);
	page = xa_load(&encl->page_array, idx);
	mutex_unlock(&encl->lock);

	if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
		return -EACCES;
}

To make things stable again, I'll go with this for the immediate future.

> 	return ret;
> 
> ... or you could rework to use the xa_lock instead of encl->lock.
> I don't know how feasible that is for you.

encl->lock is used to protect enclave state but it is true that
page->vm_max_prot_bits is not modified through concurrent access, once
the page is added (e.g. by the reclaimer, which gets pages through
sgx_activate_page_list, not through xarray).

It's an interesting idea, but before even considering it I want to fix
the bug, even if the fix ought to be somewhat suboptimal in terms of
performance.

Thanks for helping with this. xarray is still somewhat alien to me and
most of the code I see just use the iterator macros except mm/*, but
I'm slowly adapting the concepts.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-sgx.git/tree/arch/x86/kernel/cpu/sgx/encl.c
[2] https://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-sgx.git/tree/arch/x86/kernel/cpu/sgx/main.c

/Jarkko
Matthew Wilcox Oct. 5, 2020, 1:30 a.m. UTC | #9
On Mon, Oct 05, 2020 at 02:41:53AM +0300, Jarkko Sakkinen wrote:
> On Sun, Oct 04, 2020 at 11:27:50PM +0100, Matthew Wilcox wrote:
> > 	int ret = 0;
> > 
> > 	mutex_lock(&encl->lock);
> > 	rcu_read_lock();
> 
> Right, so xa_*() take RCU lock implicitly and xas_* do not.

Not necessarily the RCU lock ... I did document all this in xarray.rst:

https://www.kernel.org/doc/html/latest/core-api/xarray.html

> > 	while (xas.index < idx_end) {
> > 		page = xas_next(&xas);
> 
> It should iterate through every possible page index within the range,
> even the ones that do not have an entry, i.e. this loop also checks
> that there are no empty slots.
> 
> Does xas_next() go through every possible index, or skip the non-empty
> ones?

xas_next(), as its documentation says, will move to the next array
index:

https://www.kernel.org/doc/html/latest/core-api/xarray.html#c.xas_next

> > 		if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
> > 			ret = -EACCESS;
> > 			break;
> > 		}
> > 	}
> > 	rcu_read_unlock();
> > 	mutex_unlock(&encl->lock);
> 
> In my Geminilake NUC the maximum size of the address space is 64GB for
> an enclave, and it is not fixed but can grow in microarchitectures
> beyond that.
> 
> That means that in (*artificial*) worst case the locks would be kept for
> 64*1024*1024*1024/4096 = 16777216 iterations.

Oh, there's support for that on the XArray API too.

        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
                xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
                if (++tagged % XA_CHECK_SCHED)
                        continue;

                xas_pause(&xas);
                xas_unlock_irq(&xas);
                cond_resched();
                xas_lock_irq(&xas);
        }
        xas_unlock_irq(&xas);
Jarkko Sakkinen Oct. 5, 2020, 3:06 a.m. UTC | #10
On Mon, Oct 05, 2020 at 02:30:53AM +0100, Matthew Wilcox wrote:
> > In my Geminilake NUC the maximum size of the address space is 64GB for
> > an enclave, and it is not fixed but can grow in microarchitectures
> > beyond that.
> > 
> > That means that in (*artificial*) worst case the locks would be kept for
> > 64*1024*1024*1024/4096 = 16777216 iterations.
> 
> Oh, there's support for that on the XArray API too.
> 
>         xas_lock_irq(&xas);
>         xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
>                 xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
>                 if (++tagged % XA_CHECK_SCHED)
>                         continue;
> 
>                 xas_pause(&xas);
>                 xas_unlock_irq(&xas);
>                 cond_resched();
>                 xas_lock_irq(&xas);
>         }
>         xas_unlock_irq(&xas);

Assuming we can iterate the array without encl->lock, I think this
would translate to:

/*
 * Not taking encl->lock because:
 * 1. page attributes are not written.
 * 2. the only page attribute read is set before it is put to the array
 *    and stays constant throughout the enclave life-cycle.
 */
xas_lock(&xas);
xas_for_each_marked(&xas, page, idx_end) {
	if (++tagged % XA_CHECK_SCHED)
		continue;

	xas_pause(&xas);
	xas_unlock(&xas);

	/*
	 * Attributes are not protected by the xa_lock, so I'm assuming
	 * that this is the legit place for the check.
	 */
	if (!page || (~page->vm_max_prot_bits & vm_prot_bits))
		return -EACCES;

	cond_resched();
 	xas_lock(&xas);
}
xas_unlock(&xas);

Obviously, we cannot use this pattern by taking the encl->lock inside
the loop (ABBA and encl->lock is a mutex).

Let's enumerate:

A. sgx_encl_add_page(): uses xa_insert() and xa_erase().
B. sgx_encl_load_page(): uses xa_load().
C. sgx_encl_may_map(): is broken (for the moment).

A and B implicitly take the lock, and if a page exists at all we only
access a pure constant.

Also, since the open file keeps the instance alive, nobody is going
to pull carpet under our feet.

OK, I've just concluded that we don't need to take encl->lock in this
case. Great.

/Jarkko
Christoph Hellwig Oct. 5, 2020, 8:45 a.m. UTC | #11
On Sat, Oct 03, 2020 at 04:39:25PM +0200, Greg KH wrote:
> > @@ -0,0 +1,173 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> 
> You use gpl-only header files in this file, so how in the world can it
> be bsd-3 licensed?
> 
> Please get your legal department to agree with this, after you explain
> to them how you are mixing gpl2-only code in with this file.
> 
> > +// Copyright(c) 2016-18 Intel Corporation.
> 
> Dates are hard to get right :(

As is comment formatting apparently.  Don't use // comments for anything
but the SPDX header, please.
Greg KH Oct. 5, 2020, 9:42 a.m. UTC | #12
On Sun, Oct 04, 2020 at 05:32:46PM +0300, Jarkko Sakkinen wrote:
> On Sat, Oct 03, 2020 at 04:39:25PM +0200, Greg KH wrote:
> > On Sat, Oct 03, 2020 at 07:50:46AM +0300, Jarkko Sakkinen wrote:
> > > Intel Software Guard eXtensions (SGX) is a set of CPU instructions that can
> > > be used by applications to set aside private regions of code and data. The
> > > code outside the SGX hosted software entity is prevented from accessing the
> > > memory inside the enclave by the CPU. We call these entities enclaves.
> > > 
> > > Add a driver that provides an ioctl API to construct and run enclaves.
> > > Enclaves are constructed from pages residing in reserved physical memory
> > > areas. The contents of these pages can only be accessed when they are
> > > mapped as part of an enclave, by a hardware thread running inside the
> > > enclave.
> > > 
> > > The starting state of an enclave consists of a fixed measured set of
> > > pages that are copied to the EPC during the construction process by
> > > using the opcode ENCLS leaf functions and Software Enclave Control
> > > Structure (SECS) that defines the enclave properties.
> > > 
> > > Enclaves are constructed by using ENCLS leaf functions ECREATE, EADD and
> > > EINIT. ECREATE initializes SECS, EADD copies pages from system memory to
> > > the EPC and EINIT checks a given signed measurement and moves the enclave
> > > into a state ready for execution.
> > > 
> > > An initialized enclave can only be accessed through special Thread Control
> > > Structure (TCS) pages by using ENCLU (ring-3 only) leaf EENTER.  This leaf
> > > function converts a thread into enclave mode and continues the execution in
> > > the offset defined by the TCS provided to EENTER. An enclave is exited
> > > through syscall, exception, interrupts or by explicitly calling another
> > > ENCLU leaf EEXIT.
> > > 
> > > The mmap() permissions are capped by the contained enclave page
> > > permissions. The mapped areas must also be populated, i.e. each page
> > > address must contain a page. This logic is implemented in
> > > sgx_encl_may_map().
> > > 
> > > Cc: linux-security-module@vger.kernel.org
> > > Cc: linux-mm@kvack.org
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: Matthew Wilcox <willy@infradead.org>
> > > Acked-by: Jethro Beekman <jethro@fortanix.com>
> > > Tested-by: Jethro Beekman <jethro@fortanix.com>
> > > Tested-by: Haitao Huang <haitao.huang@linux.intel.com>
> > > Tested-by: Chunyang Hui <sanqian.hcy@antfin.com>
> > > Tested-by: Jordan Hand <jorhand@linux.microsoft.com>
> > > Tested-by: Nathaniel McCallum <npmccallum@redhat.com>
> > > Tested-by: Seth Moore <sethmo@google.com>
> > > Tested-by: Darren Kenny <darren.kenny@oracle.com>
> > > Reviewed-by: Darren Kenny <darren.kenny@oracle.com>
> > > Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > > Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> > > Co-developed-by: Suresh Siddha <suresh.b.siddha@intel.com>
> > > Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> > > Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> > > ---
> > >  arch/x86/kernel/cpu/sgx/Makefile |   2 +
> > >  arch/x86/kernel/cpu/sgx/driver.c | 173 ++++++++++++++++
> > >  arch/x86/kernel/cpu/sgx/driver.h |  29 +++
> > >  arch/x86/kernel/cpu/sgx/encl.c   | 331 +++++++++++++++++++++++++++++++
> > >  arch/x86/kernel/cpu/sgx/encl.h   |  85 ++++++++
> > >  arch/x86/kernel/cpu/sgx/main.c   |  11 +
> > >  6 files changed, 631 insertions(+)
> > >  create mode 100644 arch/x86/kernel/cpu/sgx/driver.c
> > >  create mode 100644 arch/x86/kernel/cpu/sgx/driver.h
> > >  create mode 100644 arch/x86/kernel/cpu/sgx/encl.c
> > >  create mode 100644 arch/x86/kernel/cpu/sgx/encl.h
> > > 
> > > diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
> > > index 79510ce01b3b..3fc451120735 100644
> > > --- a/arch/x86/kernel/cpu/sgx/Makefile
> > > +++ b/arch/x86/kernel/cpu/sgx/Makefile
> > > @@ -1,2 +1,4 @@
> > >  obj-y += \
> > > +	driver.o \
> > > +	encl.o \
> > >  	main.o
> > > diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
> > > new file mode 100644
> > > index 000000000000..f54da5f19c2b
> > > --- /dev/null
> > > +++ b/arch/x86/kernel/cpu/sgx/driver.c
> > > @@ -0,0 +1,173 @@
> > > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > 
> > You use gpl-only header files in this file, so how in the world can it
> > be bsd-3 licensed?
> > 
> > Please get your legal department to agree with this, after you explain
> > to them how you are mixing gpl2-only code in with this file.
> 
> I'll do what I already stated that I will do. Should I do something
> more?

This was written before your previous response.

> > > +	mutex_lock(&encl->lock);
> > > +	atomic_or(SGX_ENCL_DEAD, &encl->flags);
> > 
> > So you set a flag that this is dead, and then instantly delete it?  Why
> > does that matter?  I see you check for this flag elsewhere, but as you
> > are just about to delete this structure, how can this be an issue?
> 
> It matters because ksgxswapd (sgx_reclaimer_*) might be processing it.

I don't see that happening in this patch, did I miss it?

> It will use the flag to skip the operations that it would do to a victim
> page, when the enclave is still alive.

Again, why are you adding flags when the patch does not use them?
Please put new functionality in the specific patch that uses it.

And can you really rely on this?  How did sgx_reclaimer_* (whatever that
is), get the reference on this object in the first place?  Again, I
don't see that happening at all in here, and at a quick glance in the
other patches I don't see it there either.  What am I missing?

> > > +	mutex_unlock(&encl->lock);
> > > +
> > > +	kref_put(&encl->refcount, sgx_encl_release);
> > 
> > Don't you need to hold the lock across the put?  If not, what is
> > serializing this?
> > 
> > But an even larger comment, why is this reference count needed at all?
> > 
> > You never grab it except at init time, and you free it at close time.
> > Why not rely on the reference counting that the vfs ensures you?
> 
> Because ksgxswapd needs the alive enclave instance while it is in the
> process of swapping a victim page. The reason for this is the
> hierarchical nature of the enclave pages.
> 
> As an example, a write operation to main memory, EWB (SDM vol 3D 40-79)

What is that referencing?

> needs to access SGX Enclave Control Structure (SECS) page, which is
> contains global data for an enclave, like the unswapped child count.

Ok, but how did it get access to this structure in the first place, like
I ask above?

> > > +	return 0;
> > > +}
> > > +
> > > +static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
> > > +{
> > > +	struct sgx_encl *encl = file->private_data;
> > > +	int ret;
> > > +
> > > +	ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	ret = sgx_encl_mm_add(encl, vma->vm_mm);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	vma->vm_ops = &sgx_vm_ops;
> > > +	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
> > > +	vma->vm_private_data = encl;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static unsigned long sgx_get_unmapped_area(struct file *file,
> > > +					   unsigned long addr,
> > > +					   unsigned long len,
> > > +					   unsigned long pgoff,
> > > +					   unsigned long flags)
> > > +{
> > > +	if ((flags & MAP_TYPE) == MAP_PRIVATE)
> > > +		return -EINVAL;
> > > +
> > > +	if (flags & MAP_FIXED)
> > > +		return addr;
> > > +
> > > +	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
> > > +}
> > > +
> > > +static const struct file_operations sgx_encl_fops = {
> > > +	.owner			= THIS_MODULE,
> > > +	.open			= sgx_open,
> > > +	.release		= sgx_release,
> > > +	.mmap			= sgx_mmap,
> > > +	.get_unmapped_area	= sgx_get_unmapped_area,
> > > +};
> > > +
> > > +static struct miscdevice sgx_dev_enclave = {
> > > +	.minor = MISC_DYNAMIC_MINOR,
> > > +	.name = "enclave",
> > > +	.nodename = "sgx/enclave",
> > 
> > A subdir for a single device node?  Ok, odd, but why not just
> > "sgx_enclave"?  How "special" is this device node?
> 
> There is a patch that adds "sgx/provision".

What number in this series?

> Either works for me. Should I flatten them to "sgx_enclave" and
> "sgx_provision", or keep them as they are?

Having 2 char nodes in a subdir is better than one, I will give you
that.  But none is even better, don't you think?

thanks,

greg k-h
Jarkko Sakkinen Oct. 5, 2020, 11:42 a.m. UTC | #13
On Mon, Oct 05, 2020 at 09:45:54AM +0100, Christoph Hellwig wrote:
> On Sat, Oct 03, 2020 at 04:39:25PM +0200, Greg KH wrote:
> > > @@ -0,0 +1,173 @@
> > > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > 
> > You use gpl-only header files in this file, so how in the world can it
> > be bsd-3 licensed?
> > 
> > Please get your legal department to agree with this, after you explain
> > to them how you are mixing gpl2-only code in with this file.
> > 
> > > +// Copyright(c) 2016-18 Intel Corporation.
> > 
> > Dates are hard to get right :(
> 
> As is comment formatting apparently.  Don't use // comments for anything
> but the SPDX header, please.

I'll bring some context to this.

When I moved into using SPDX, I took the example from places where I saw
also the copyright using "//". That's the reason for the choice.

I.e.

$ git grep "// Copyright" | wc -l
2123

I don't care, which one to use, just wondering is it done in the wrong
way in all these sites?

/Jarkko
Greg KH Oct. 5, 2020, 11:50 a.m. UTC | #14
On Mon, Oct 05, 2020 at 02:42:50PM +0300, Jarkko Sakkinen wrote:
> On Mon, Oct 05, 2020 at 09:45:54AM +0100, Christoph Hellwig wrote:
> > On Sat, Oct 03, 2020 at 04:39:25PM +0200, Greg KH wrote:
> > > > @@ -0,0 +1,173 @@
> > > > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > > 
> > > You use gpl-only header files in this file, so how in the world can it
> > > be bsd-3 licensed?
> > > 
> > > Please get your legal department to agree with this, after you explain
> > > to them how you are mixing gpl2-only code in with this file.
> > > 
> > > > +// Copyright(c) 2016-18 Intel Corporation.
> > > 
> > > Dates are hard to get right :(
> > 
> > As is comment formatting apparently.  Don't use // comments for anything
> > but the SPDX header, please.
> 
> I'll bring some context to this.
> 
> When I moved into using SPDX, I took the example from places where I saw
> also the copyright using "//". That's the reason for the choice.
> 
> I.e.
> 
> $ git grep "// Copyright" | wc -l
> 2123
> 
> I don't care, which one to use, just wondering is it done in the wrong
> way in all these sites?

Probably, but I know at least one subsystem requires their headers to be
in this manner.  There's no accounting for taste :)

thanks,

greg k-h
Jarkko Sakkinen Oct. 5, 2020, 12:42 p.m. UTC | #15
On Mon, Oct 05, 2020 at 11:42:46AM +0200, Greg KH wrote:
> > > You use gpl-only header files in this file, so how in the world can it
> > > be bsd-3 licensed?
> > > 
> > > Please get your legal department to agree with this, after you explain
> > > to them how you are mixing gpl2-only code in with this file.
> > 
> > I'll do what I already stated that I will do. Should I do something
> > more?
> 
> This was written before your previous response.

OK, that is weird, I got this one some time later.

> > > > +	mutex_lock(&encl->lock);
> > > > +	atomic_or(SGX_ENCL_DEAD, &encl->flags);
> > > 
> > > So you set a flag that this is dead, and then instantly delete it?  Why
> > > does that matter?  I see you check for this flag elsewhere, but as you
> > > are just about to delete this structure, how can this be an issue?
> > 
> > It matters because ksgxswapd (sgx_reclaimer_*) might be processing it.
> 
> I don't see that happening in this patch, did I miss it?

It's implemented in 16/24:

https://lore.kernel.org/linux-sgx/20201004223921.GA48517@linux.intel.com/T/#u

> > It will use the flag to skip the operations that it would do to a victim
> > page, when the enclave is still alive.
> 
> Again, why are you adding flags when the patch does not use them?
> Please put new functionality in the specific patch that uses it.
> 
> And can you really rely on this?  How did sgx_reclaimer_* (whatever that
> is), get the reference on this object in the first place?  Again, I
> don't see that happening at all in here, and at a quick glance in the
> other patches I don't see it there either.  What am I missing?

I went through the patch, and yes, they can be migrated to 16/24.
I agree with this, no excuses.

In 16/24 pages are added to sgx_active_page_list from which they are
swapped by the reclaimer to the main memory when Enclave Page Cache
(EPC), the memory where enclave pages reside, gets full.

When a reclaimer thread takes a victim page from that list, it will also
get a kref to the enclave so that struct sgx_encl instance does not
get wiped while it's doing its job.

> > Because ksgxswapd needs the alive enclave instance while it is in the
> > process of swapping a victim page. The reason for this is the
> > hierarchical nature of the enclave pages.
> > 
> > As an example, a write operation to main memory, EWB (SDM vol 3D 40-79)
> 
> What is that referencing?

https://software.intel.com/content/dam/develop/public/us/en/documents/332831-sdm-vol-3d.pdf

> > needs to access SGX Enclave Control Structure (SECS) page, which is
> > contains global data for an enclave, like the unswapped child count.
> 
> Ok, but how did it get access to this structure in the first place, like
> I ask above?

I guess I answered that, and I also fully agree with your suggestions.

It used to be many iterations ago that enclaves were not file based but
just memory mappings (long story short: was not a great way to make them
multiprocess, that's why file centered now), and then refcount played a
bigger role. Having those "extras" in this patch is by no means
intentional but more like cruft of many iterations of refactoring.

Sometimes when you work long with this kind of pile of code, which has
converged through many iterations, you really need someone else to point
some of the simple and obvious things out.

> > There is a patch that adds "sgx/provision".
> 
> What number in this series?

It's 15/24.

> 
> > Either works for me. Should I flatten them to "sgx_enclave" and
> > "sgx_provision", or keep them as they are?
> 
> Having 2 char nodes in a subdir is better than one, I will give you
> that.  But none is even better, don't you think?

I think that having just "sgx_enclave" and "sgx_provision" would be
better.

I've been thinking about this for a while but at the same time try not
to be too proactive without feedback. One reason would be that "enclave"
and "provision" without the subdir are not good identifiers.

I also recalled this discussion:

https://lkml.org/lkml/2019/12/23/158

and was wondering how that subdir would even play with /sys/class/misc,
if we decide to add attributes? Not enough knowledge to answer this.

Anyway, I'll put a note to my backlog on this, and also to move the
previously discussed cruft to the correct patch.

> thanks,
> 
> greg k-h

Thank you.

/Jarkko
Jarkko Sakkinen Oct. 5, 2020, 2:23 p.m. UTC | #16
On Mon, Oct 05, 2020 at 01:50:30PM +0200, Greg KH wrote:
> On Mon, Oct 05, 2020 at 02:42:50PM +0300, Jarkko Sakkinen wrote:
> > On Mon, Oct 05, 2020 at 09:45:54AM +0100, Christoph Hellwig wrote:
> > > On Sat, Oct 03, 2020 at 04:39:25PM +0200, Greg KH wrote:
> > > > > @@ -0,0 +1,173 @@
> > > > > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > > > 
> > > > You use gpl-only header files in this file, so how in the world can it
> > > > be bsd-3 licensed?
> > > > 
> > > > Please get your legal department to agree with this, after you explain
> > > > to them how you are mixing gpl2-only code in with this file.
> > > > 
> > > > > +// Copyright(c) 2016-18 Intel Corporation.
> > > > 
> > > > Dates are hard to get right :(
> > > 
> > > As is comment formatting apparently.  Don't use // comments for anything
> > > but the SPDX header, please.
> > 
> > I'll bring some context to this.
> > 
> > When I moved into using SPDX, I took the example from places where I saw
> > also the copyright using "//". That's the reason for the choice.
> > 
> > I.e.
> > 
> > $ git grep "// Copyright" | wc -l
> > 2123
> > 
> > I don't care, which one to use, just wondering is it done in the wrong
> > way in all these sites?
> 
> Probably, but I know at least one subsystem requires their headers to be
> in this manner.  There's no accounting for taste :)

This discussion is a bit confusing [*], so I'll just ask from Git:

➜  linux-sgx (master) ✔ git --no-pager grep "\/\/ Copyright" arch/x86
arch/x86/kernel/cpu/sgx/driver.c:// Copyright(c) 2016-20 Intel Corporation.
arch/x86/kernel/cpu/sgx/encl.c:// Copyright(c) 2016-20 Intel Corporation.
arch/x86/kernel/cpu/sgx/ioctl.c:// Copyright(c) 2016-20 Intel Corporation.
arch/x86/kernel/cpu/sgx/main.c:// Copyright(c) 2016-20 Intel Corporation.

OK, now I think I know what to do :-)

> thanks,
> 
> greg k-h

[*] One thing I've been wondering for a long time is that, why new code
should have the copyright platters in the first place? I get it for
pre-Git era but now there is a cryptographic log of authority.

Copyright platters, remarking the authors to the header and
MODULE_AUTHOR() macro are the three things that I just do not get in the
modern times.

/Jarkko
Greg KH Oct. 5, 2020, 3:02 p.m. UTC | #17
On Mon, Oct 05, 2020 at 05:23:45PM +0300, Jarkko Sakkinen wrote:
> [*] One thing I've been wondering for a long time is that, why new code
> should have the copyright platters in the first place? I get it for
> pre-Git era but now there is a cryptographic log of authority.

Go talk to your corporate lawyers about this, it is one of the most
common cargo-cult patterns around :)

good luck!

greg k-h
Dave Hansen Oct. 5, 2020, 4:40 p.m. UTC | #18
On 10/5/20 8:02 AM, Greg KH wrote:
> On Mon, Oct 05, 2020 at 05:23:45PM +0300, Jarkko Sakkinen wrote:
>> [*] One thing I've been wondering for a long time is that, why new code
>> should have the copyright platters in the first place? I get it for
>> pre-Git era but now there is a cryptographic log of authority.
> Go talk to your corporate lawyers about this, it is one of the most
> common cargo-cult patterns around :)

For this patch, though, it seems like we should just update the dates
instead of removing them.

If I look at the last 1000 "^+.*Copyright" lines added to the kernel,
997 of them have a year.  So, weird or not, it's a pretty standard
convention.  We'd need a slightly more broad conversation before we
decide to nix these dates.

Pure speculation: Copyright protection, at least in the US, is not
forever.  I _think_ it's 75 years or something.  That protection starts
when the work is created and is independent of when it gets merged into
Linux.  So, if we did something weird like merge a driver written 10
years ago, it would only be protected for 65 more years after we merge
it.  In other words, git history _might_ be irrelevant for copyright
protection.
Jarkko Sakkinen Oct. 5, 2020, 8:02 p.m. UTC | #19
On Mon, Oct 05, 2020 at 09:40:52AM -0700, Dave Hansen wrote:
> On 10/5/20 8:02 AM, Greg KH wrote:
> > On Mon, Oct 05, 2020 at 05:23:45PM +0300, Jarkko Sakkinen wrote:
> >> [*] One thing I've been wondering for a long time is that, why new code
> >> should have the copyright platters in the first place? I get it for
> >> pre-Git era but now there is a cryptographic log of authority.
> > Go talk to your corporate lawyers about this, it is one of the most
> > common cargo-cult patterns around :)
> 
> For this patch, though, it seems like we should just update the dates
> instead of removing them.

Already done. I updated them yesterday as:

  Copyright(c) 2016-20 Intel Corporation.

Changing from '//' to '/* ... */' is not done yet.

> If I look at the last 1000 "^+.*Copyright" lines added to the kernel,
> 997 of them have a year.  So, weird or not, it's a pretty standard
> convention.  We'd need a slightly more broad conversation before we
> decide to nix these dates.
> 
> Pure speculation: Copyright protection, at least in the US, is not
> forever.  I _think_ it's 75 years or something.  That protection starts
> when the work is created and is independent of when it gets merged into
> Linux.  So, if we did something weird like merge a driver written 10
> years ago, it would only be protected for 65 more years after we merge
> it.  In other words, git history _might_ be irrelevant for copyright
> protection.

/Jarkko
Haitao Huang Oct. 7, 2020, 6:09 p.m. UTC | #20
On Mon, 05 Oct 2020 07:42:21 -0500, Jarkko Sakkinen  
<jarkko.sakkinen@linux.intel.com> wrote:

> On Mon, Oct 05, 2020 at 11:42:46AM +0200, Greg KH wrote:
>> > > You use gpl-only header files in this file, so how in the world can  
>> it
>> > > be bsd-3 licensed?
>> > >
>> > > Please get your legal department to agree with this, after you  
>> explain
>> > > to them how you are mixing gpl2-only code in with this file.
>> >
>> > I'll do what I already stated that I will do. Should I do something
>> > more?
>>
>> This was written before your previous response.
>
> OK, that is weird, I got this one some time later.
>
>> > > > +	mutex_lock(&encl->lock);
>> > > > +	atomic_or(SGX_ENCL_DEAD, &encl->flags);
>> > >
>> > > So you set a flag that this is dead, and then instantly delete it?   
>> Why
>> > > does that matter?  I see you check for this flag elsewhere, but as  
>> you
>> > > are just about to delete this structure, how can this be an issue?
>> >
>> > It matters because ksgxswapd (sgx_reclaimer_*) might be processing it.
>>
>> I don't see that happening in this patch, did I miss it?
>
> It's implemented in 16/24:
>
> https://lore.kernel.org/linux-sgx/20201004223921.GA48517@linux.intel.com/T/#u
>
>> > It will use the flag to skip the operations that it would do to a  
>> victim
>> > page, when the enclave is still alive.
>>
>> Again, why are you adding flags when the patch does not use them?
>> Please put new functionality in the specific patch that uses it.
>>
>> And can you really rely on this?  How did sgx_reclaimer_* (whatever that
>> is), get the reference on this object in the first place?  Again, I
>> don't see that happening at all in here, and at a quick glance in the
>> other patches I don't see it there either.  What am I missing?
>
> I went through the patch, and yes, they can be migrated to 16/24.
> I agree with this, no excuses.
>
> In 16/24 pages are added to sgx_active_page_list from which they are
> swapped by the reclaimer to the main memory when Enclave Page Cache
> (EPC), the memory where enclave pages reside, gets full.
>
> When a reclaimer thread takes a victim page from that list, it will also
> get a kref to the enclave so that struct sgx_encl instance does not
> get wiped while it's doing its job.
>
>> > Because ksgxswapd needs the alive enclave instance while it is in the
>> > process of swapping a victim page. The reason for this is the
>> > hierarchical nature of the enclave pages.
>> >
>> > As an example, a write operation to main memory, EWB (SDM vol 3D  
>> 40-79)
>>
>> What is that referencing?
>
> https://software.intel.com/content/dam/develop/public/us/en/documents/332831-sdm-vol-3d.pdf
>
>> > needs to access SGX Enclave Control Structure (SECS) page, which
>> > contains global data for an enclave, like the unswapped child count.
>>
>> Ok, but how did it get access to this structure in the first place, like
>> I ask above?
>
> I guess I answered that, and I also fully agree with your suggestions.
>
> It used to be many iterations ago that enclaves were not file based but
> just memory mappings (long story short: was not a great way to make them
> multiprocess, that's why file centered now), and then refcount played a
> bigger role. Having those "extras" in this patch is by no means
> intentional but more like cruft of many iterations of refactoring.
>
> Sometimes when you work long with this kind of pile of code, which has
> converged through many iterations, you really need someone else to point
> some of the simple and obvious things out.
>
>> > There is a patch that adds "sgx/provision".
>>
>> What number in this series?
>
> It's 15/24.
>

Don't know if this is critical. I'd prefer to keep them as is. Directory  
seems natural to me and makes sense to add more under the same dir in case  
there are more to come.

Thanks
Haitao
Greg KH Oct. 7, 2020, 7:26 p.m. UTC | #21
On Wed, Oct 07, 2020 at 01:09:01PM -0500, Haitao Huang wrote:
> > > > There is a patch that adds "sgx/provision".
> > > 
> > > What number in this series?
> > 
> > It's 15/24.
> > 
> 
> Don't know if this is critical. I'd prefer to keep them as is. Directory
> seems natural to me and makes sense to add more under the same dir in case
> there are more to come.

Why is this so special that you need a subdirectory for a single driver
with a mere 2 device nodes?  Do any other misc drivers have a new
subdirectory in /dev/ for them?

thanks,

greg k-h
Jarkko Sakkinen Oct. 9, 2020, 6:44 a.m. UTC | #22
On Wed, Oct 07, 2020 at 09:26:55PM +0200, Greg KH wrote:
> On Wed, Oct 07, 2020 at 01:09:01PM -0500, Haitao Huang wrote:
> > > > > There is a patch that adds "sgx/provision".
> > > > 
> > > > What number in this series?
> > > 
> > > It's 15/24.
> > > 
> > 
> > Don't know if this is critical. I'd prefer to keep them as is. Directory
> > seems natural to me and makes sense to add more under the same dir in case
> > there are more to come.
> 
> Why is this so special that you need a subdirectory for a single driver
> with a mere 2 device nodes?  Do any other misc drivers have a new
> subdirectory in /dev/ for them?

Absolutely nothing as far as I'm concerned. Should have done that
already at the time when I switched to misc based on your feedback. I
was acting too reactive I guess. For sure I'll rename.

I also looked at encl->refcount with time. Instead of just "moving the
garbage up to the correct waste pit", I'll address that one by
refactoring it out and making the reclaimer thread to do the reaper's
job.

> thanks,
> 
> greg k-h

/Jarkko
Pavel Machek Oct. 9, 2020, 7:10 a.m. UTC | #23
Hi!

> > new file mode 100644
> > index 000000000000..f54da5f19c2b
> > --- /dev/null
> > +++ b/arch/x86/kernel/cpu/sgx/driver.c
> > @@ -0,0 +1,173 @@
> > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> 
> You use gpl-only header files in this file, so how in the world can it
> be bsd-3 licensed?
> 
> Please get your legal department to agree with this, after you explain
> to them how you are mixing gpl2-only code in with this file.

This specifies license of driver.c, not of the headers included. Are
you saying that it is impossible to have a kernel driver with anything
else than GPL-2? That would be news to many, and that's not what
current consensus is.

									Pavel
Greg KH Oct. 9, 2020, 7:21 a.m. UTC | #24
On Fri, Oct 09, 2020 at 09:10:45AM +0200, Pavel Machek wrote:
> Hi!
> 
> > > new file mode 100644
> > > index 000000000000..f54da5f19c2b
> > > --- /dev/null
> > > +++ b/arch/x86/kernel/cpu/sgx/driver.c
> > > @@ -0,0 +1,173 @@
> > > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > 
> > You use gpl-only header files in this file, so how in the world can it
> > be bsd-3 licensed?
> > 
> > Please get your legal department to agree with this, after you explain
> > to them how you are mixing gpl2-only code in with this file.
> 
> This specifies license of driver.c, not of the headers included. Are
> you saying that it is impossible to have a kernel driver with anything
> else than GPL-2? That would be news to many, and that's not what
> current consensus is.

If you want to write any non-GPL-2-only kernel code, you had better be
consulting your lawyers and get very explicit instructions on how to do
this in a way that does not violate any licenses.

I am not a lawyer, and will not be giving you any such advice, as I
think it's not something that people should be doing.

greg k-h
Pavel Machek Oct. 9, 2020, 8:21 a.m. UTC | #25
On Fri 2020-10-09 09:21:41, Greg KH wrote:
> On Fri, Oct 09, 2020 at 09:10:45AM +0200, Pavel Machek wrote:
> > Hi!
> > 
> > > > new file mode 100644
> > > > index 000000000000..f54da5f19c2b
> > > > --- /dev/null
> > > > +++ b/arch/x86/kernel/cpu/sgx/driver.c
> > > > @@ -0,0 +1,173 @@
> > > > +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> > > 
> > > You use gpl-only header files in this file, so how in the world can it
> > > be bsd-3 licensed?
> > > 
> > > Please get your legal department to agree with this, after you explain
> > > to them how you are mixing gpl2-only code in with this file.
> > 
> > This specifies license of driver.c, not of the headers included. Are
> > you saying that it is impossible to have a kernel driver with anything
> > else than GPL-2? That would be news to many, and that's not what
> > current consensus is.
> 
> If you want to write any non-GPL-2-only kernel code, you had better be
> consulting your lawyers and get very explicit instructions on how to do
> this in a way that does not violate any licenses.
> 
> I am not a lawyer, and will not be giving you any such advice, as I
> think it's not something that people should be doing.

You are pushing view that is well outside accepted community
consensus, then try to hide it by claiming that you are not a lawyer.

Stop it.

Dual licensed drivers are common in the kernel, and are considered
okay by everyone but you. Author is free to select license for his
work.

								Pavel
Dave Hansen Oct. 14, 2020, 8:16 p.m. UTC | #26
On 10/8/20 11:44 PM, Jarkko Sakkinen wrote:
>> Why is this so special that you need a subdirectory for a single driver
>> with a mere 2 device nodes?  Do any other misc drivers have a new
>> subdirectory in /dev/ for them?
> Absolutely nothing as far as I'm concerned. Should have done that
> already at the time when I switched to misc based on your feedback. I
> was acting too reactive I guess. For sure I'll rename.

Plus, if anyone *REALLY* cares, they can get their precious directory
back with a couple of lines of udev rules, I believe:

KERNEL=="sgx_provision", SYMLINK+="sgx/provision"
KERNEL=="sgx_enclave", SYMLINK+="sgx/enclave"
diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 79510ce01b3b..3fc451120735 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -1,2 +1,4 @@ 
 obj-y += \
+	driver.o \
+	encl.o \
 	main.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
new file mode 100644
index 000000000000..f54da5f19c2b
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -0,0 +1,173 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <linux/acpi.h>
+#include <linux/miscdevice.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "driver.h"
+#include "encl.h"
+
+/*
+ * Platform limits and reserved-bit masks. All of them are filled in by
+ * sgx_drv_init() from CPUID output.
+ */
+
+/* Maximum enclave sizes, computed as powers of two from CPUID(SGX_CPUID, 0).EDX. */
+u64 sgx_encl_size_max_32;
+u64 sgx_encl_size_max_64;
+/* ~CPUID(SGX_CPUID, 0).EBX combined with SGX_MISC_RESERVED_MASK. */
+u32 sgx_misc_reserved_mask;
+/* Inverse of CPUID(SGX_CPUID, 1).EBX:EAX combined with SGX_ATTR_RESERVED_MASK. */
+u64 sgx_attributes_reserved_mask;
+/*
+ * Defaults to everything but bits 0 and 1; replaced with the inverse of
+ * CPUID(SGX_CPUID, 1).EDX:ECX when X86_FEATURE_OSXSAVE is set.
+ */
+u64 sgx_xfrm_reserved_mask = ~0x3;
+/* Indexed by XFRM bit number; holds EAX + EBX of CPUID(0x0D, bit). */
+u32 sgx_xsave_size_tbl[64];
+
+/*
+ * open() handler of the enclave device: allocate a zeroed struct sgx_encl,
+ * initialize its locks, containers and reference count, and attach it to the
+ * file as private data.
+ *
+ * Return: 0 on success, -ENOMEM if the allocation fails, or the error returned
+ * by init_srcu_struct().
+ */
+static int sgx_open(struct inode *inode, struct file *file)
+{
+	struct sgx_encl *new_encl;
+	int err;
+
+	new_encl = kzalloc(sizeof(*new_encl), GFP_KERNEL);
+	if (!new_encl)
+		return -ENOMEM;
+
+	/* Do the only fallible initialization first so failure is a plain free. */
+	err = init_srcu_struct(&new_encl->srcu);
+	if (err) {
+		kfree(new_encl);
+		return err;
+	}
+
+	atomic_set(&new_encl->flags, 0);
+	kref_init(&new_encl->refcount);
+	xa_init(&new_encl->page_array);
+	mutex_init(&new_encl->lock);
+	INIT_LIST_HEAD(&new_encl->mm_list);
+	spin_lock_init(&new_encl->mm_lock);
+
+	file->private_data = new_encl;
+
+	return 0;
+}
+
+/*
+ * release() handler of the enclave device.
+ *
+ * Detaches every sgx_encl_mm still on the enclave's mm_list, marks the enclave
+ * dead and drops the reference taken by kref_init() in sgx_open().
+ * sgx_encl_release() runs once the last reference is gone.
+ *
+ * Always returns 0.
+ */
+static int sgx_release(struct inode *inode, struct file *file)
+{
+	struct sgx_encl *encl = file->private_data;
+	struct sgx_encl_mm *encl_mm;
+
+	/*
+	 * Pop one entry per iteration. mm_lock is a spinlock, so it is only
+	 * held for the list manipulation itself and dropped before the calls
+	 * below are made.
+	 */
+	for ( ; ; )  {
+		spin_lock(&encl->mm_lock);
+
+		if (list_empty(&encl->mm_list)) {
+			encl_mm = NULL;
+		} else {
+			encl_mm = list_first_entry(&encl->mm_list,
+						   struct sgx_encl_mm, list);
+			list_del_rcu(&encl_mm->list);
+		}
+
+		spin_unlock(&encl->mm_lock);
+
+		/* The list is empty, ready to go. */
+		if (!encl_mm)
+			break;
+
+		/*
+		 * Wait for SRCU readers that may still see the entry (the list
+		 * is walked under srcu_read_lock() in sgx_encl_find_mm())
+		 * before unregistering the notifier and freeing it.
+		 */
+		synchronize_srcu(&encl->srcu);
+		mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+		kfree(encl_mm);
+	}
+
+	/* Set the flag under encl->lock, the lock the fault handler takes. */
+	mutex_lock(&encl->lock);
+	atomic_or(SGX_ENCL_DEAD, &encl->flags);
+	mutex_unlock(&encl->lock);
+
+	kref_put(&encl->refcount, sgx_encl_release);
+	return 0;
+}
+
+/*
+ * mmap() handler of the enclave device.
+ *
+ * The mapping is accepted only if sgx_encl_may_map() allows the requested
+ * protections for the range and the calling mm can be tracked with
+ * sgx_encl_mm_add(). On success the VMA gets the enclave VM operations and
+ * the enclave as its private data.
+ *
+ * Return: 0 on success, otherwise the error from the failing helper.
+ */
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct sgx_encl *encl = file->private_data;
+	int err;
+
+	err = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
+	if (!err)
+		err = sgx_encl_mm_add(encl, vma->vm_mm);
+	if (err)
+		return err;
+
+	vma->vm_private_data = encl;
+	vma->vm_ops = &sgx_vm_ops;
+	vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
+
+	return 0;
+}
+
+/*
+ * get_unmapped_area() handler of the enclave device.
+ *
+ * MAP_PRIVATE requests are refused with -EINVAL. A MAP_FIXED request gets the
+ * caller's address back unchanged; everything else is delegated to the
+ * current mm's get_unmapped_area().
+ */
+static unsigned long sgx_get_unmapped_area(struct file *file,
+					   unsigned long addr,
+					   unsigned long len,
+					   unsigned long pgoff,
+					   unsigned long flags)
+{
+	if ((flags & MAP_TYPE) == MAP_PRIVATE)
+		return -EINVAL;
+
+	if (!(flags & MAP_FIXED))
+		return current->mm->get_unmapped_area(file, addr, len, pgoff,
+						      flags);
+
+	return addr;
+}
+
+/*
+ * File operations of the enclave device node. No ioctl handler is installed
+ * in this table.
+ */
+static const struct file_operations sgx_encl_fops = {
+	.owner			= THIS_MODULE,
+	.open			= sgx_open,
+	.release		= sgx_release,
+	.mmap			= sgx_mmap,
+	.get_unmapped_area	= sgx_get_unmapped_area,
+};
+
+/*
+ * Misc device for enclaves, registered by sgx_drv_init(). The minor number is
+ * allocated dynamically and the requested node name is "sgx/enclave".
+ */
+static struct miscdevice sgx_dev_enclave = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "enclave",
+	.nodename = "sgx/enclave",
+	.fops = &sgx_encl_fops,
+};
+
+/**
+ * sgx_drv_init() - Initialize the SGX enclave driver
+ *
+ * Read the enclave size limits and the MISCSELECT, ATTRIBUTES and XFRM
+ * capability bits from CPUID, derive the reserved-bit masks from them and
+ * register the enclave misc device.
+ *
+ * Return:
+ *   0 on success,
+ *   -ENODEV if X86_FEATURE_SGX_LC is not available,
+ *   the error returned by misc_register() otherwise
+ */
+int __init sgx_drv_init(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	u64 attr_mask, xfrm_mask;
+	int ret;
+	int i;
+
+	if (!boot_cpu_has(X86_FEATURE_SGX_LC)) {
+		pr_info("The public key MSRs are not writable.\n");
+		return -ENODEV;
+	}
+
+	/* EDX[7:0] and EDX[15:8] are log2 of the maximum enclave sizes. */
+	cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
+	sgx_encl_size_max_64 = 1ULL << ((edx >> 8) & 0xFF);
+	sgx_encl_size_max_32 = 1ULL << (edx & 0xFF);
+
+	/* EBX:EAX hold the allowed ATTRIBUTES bits, EDX:ECX the allowed XFRM. */
+	cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+
+	attr_mask = (((u64)ebx) << 32) + (u64)eax;
+	sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
+
+	if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+		xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+
+		for (i = 2; i < 64; i++) {
+			/*
+			 * Test with a 64-bit constant to match the width of
+			 * the mask, and skip the CPUID query for components
+			 * whose result would be thrown away.
+			 */
+			if (!(xfrm_mask & (1ULL << i)))
+				continue;
+
+			/* Size (EAX) plus offset (EBX) of the component. */
+			cpuid_count(0x0D, i, &eax, &ebx, &ecx, &edx);
+			sgx_xsave_size_tbl[i] = eax + ebx;
+		}
+
+		sgx_xfrm_reserved_mask = ~xfrm_mask;
+	}
+
+	ret = misc_register(&sgx_dev_enclave);
+	if (ret) {
+		pr_err("Creating /dev/sgx/enclave failed with %d.\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
new file mode 100644
index 000000000000..f7ce40dedc91
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -0,0 +1,29 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+#ifndef __ARCH_SGX_DRIVER_H__
+#define __ARCH_SGX_DRIVER_H__
+
+#include <crypto/hash.h>
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include "sgx.h"
+
+#define SGX_EINIT_SPIN_COUNT	20
+#define SGX_EINIT_SLEEP_COUNT	50
+#define SGX_EINIT_SLEEP_TIME	20
+
+extern u64 sgx_encl_size_max_32;
+extern u64 sgx_encl_size_max_64;
+extern u32 sgx_misc_reserved_mask;
+extern u64 sgx_attributes_reserved_mask;
+extern u64 sgx_xfrm_reserved_mask;
+extern u32 sgx_xsave_size_tbl[64];
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+int sgx_drv_init(void);
+
+#endif /* __ARCH_SGX_DRIVER_H__ */
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
new file mode 100644
index 000000000000..c2c4a77af36b
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -0,0 +1,331 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/suspend.h>
+#include <linux/sched/mm.h>
+#include "arch.h"
+#include "encl.h"
+#include "encls.h"
+#include "sgx.h"
+
+/*
+ * Look up the enclave page that covers @addr.
+ *
+ * Returns the page only when the enclave is initialized, not dead, and the
+ * page currently has an EPC page attached. Every other case yields
+ * ERR_PTR(-EFAULT).
+ */
+static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+						unsigned long addr)
+{
+	const unsigned int state_mask = SGX_ENCL_DEAD | SGX_ENCL_INITIALIZED;
+	struct sgx_encl_page *encl_page;
+
+	/*
+	 * If process was forked, VMA is still there but vm_private_data is set
+	 * to NULL.
+	 */
+	if (!encl)
+		return ERR_PTR(-EFAULT);
+
+	/* Usable only when INITIALIZED is set and DEAD is clear. */
+	if ((atomic_read(&encl->flags) & state_mask) != SGX_ENCL_INITIALIZED)
+		return ERR_PTR(-EFAULT);
+
+	encl_page = xa_load(&encl->page_array, PFN_DOWN(addr));
+
+	/* Either no such page, or it is not resident in the EPC. */
+	if (!encl_page || !encl_page->epc_page)
+		return ERR_PTR(-EFAULT);
+
+	return encl_page;
+}
+
+/*
+ * MMU notifier .release callback.
+ *
+ * Removes the sgx_encl_mm that embeds @mn from its enclave's mm_list, unless
+ * it has already been taken off the list by someone else (sgx_release() does
+ * that when the enclave file is closed).
+ */
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+				     struct mm_struct *mm)
+{
+	struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+	struct sgx_encl_mm *tmp = NULL;
+
+	/*
+	 * The enclave itself can remove encl_mm.  Note, objects can't be moved
+	 * off an RCU protected list, but deletion is ok.
+	 */
+	spin_lock(&encl_mm->encl->mm_lock);
+	list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+		if (tmp == encl_mm) {
+			list_del_rcu(&encl_mm->list);
+			break;
+		}
+	}
+	spin_unlock(&encl_mm->encl->mm_lock);
+
+	/*
+	 * tmp equals encl_mm only if the entry was found and unlinked above.
+	 * In that case wait for SRCU readers of the list, then drop the
+	 * notifier; the memory is freed from the .free_notifier callback.
+	 */
+	if (tmp == encl_mm) {
+		synchronize_srcu(&encl_mm->encl->srcu);
+		mmu_notifier_put(mn);
+	}
+}
+
+/* MMU notifier .free_notifier callback: free the sgx_encl_mm embedding @mn. */
+static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
+{
+	kfree(container_of(mn, struct sgx_encl_mm, mmu_notifier));
+}
+
+/* Notifier callbacks installed on every mm tracked by sgx_encl_mm_add(). */
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+	.release		= sgx_mmu_notifier_release,
+	.free_notifier		= sgx_mmu_notifier_free,
+};
+
+/*
+ * Search the enclave's mm_list for the entry that tracks @mm. The list is
+ * walked inside an SRCU read-side critical section.
+ *
+ * Returns the matching sgx_encl_mm, or NULL if @mm is not on the list.
+ */
+static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
+					    struct mm_struct *mm)
+{
+	struct sgx_encl_mm *match = NULL;
+	struct sgx_encl_mm *cur;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&encl->srcu);
+
+	list_for_each_entry_rcu(cur, &encl->mm_list, list) {
+		if (cur->mm != mm)
+			continue;
+
+		match = cur;
+		break;
+	}
+
+	srcu_read_unlock(&encl->srcu, srcu_idx);
+
+	return match;
+}
+
+/*
+ * Start tracking @mm as a user of @encl.
+ *
+ * Allocates an sgx_encl_mm, registers its MMU notifier on @mm and adds it to
+ * the enclave's mm_list. Nothing is done if @mm is already on the list. The
+ * caller must hold the mmap lock of @mm for writing (asserted below).
+ *
+ * Return:
+ *   0 on success or if @mm is already tracked,
+ *   -EINVAL if the enclave is dead,
+ *   -ENOMEM if the allocation fails,
+ *   the error from __mmu_notifier_register() otherwise
+ */
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
+{
+	struct sgx_encl_mm *encl_mm;
+	int ret;
+
+	/* mm_list can be accessed only by a single thread at a time. */
+	mmap_assert_write_locked(mm);
+
+	if (atomic_read(&encl->flags) & SGX_ENCL_DEAD)
+		return -EINVAL;
+
+	/*
+	 * mm_structs are kept on mm_list until the mm or the enclave dies,
+	 * i.e. once an mm is off the list, it's gone for good, therefore it's
+	 * impossible to get a false positive on @mm due to a stale mm_list.
+	 */
+	if (sgx_encl_find_mm(encl, mm))
+		return 0;
+
+	encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
+	if (!encl_mm)
+		return -ENOMEM;
+
+	encl_mm->encl = encl;
+	encl_mm->mm = mm;
+	encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+	/* Register first; the entry is only published on the list on success. */
+	ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+	if (ret) {
+		kfree(encl_mm);
+		return ret;
+	}
+
+	spin_lock(&encl->mm_lock);
+	list_add_rcu(&encl_mm->list, &encl->mm_list);
+	spin_unlock(&encl->mm_lock);
+
+	return 0;
+}
+
+/*
+ * VMA .open callback: make sure the mm owning @vma is tracked by the enclave.
+ * If tracking fails, the VMA is detached from the enclave by clearing its
+ * private data. A VMA that is already detached is left alone.
+ */
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+	struct sgx_encl *encl = vma->vm_private_data;
+
+	if (encl && sgx_encl_mm_add(encl, vma->vm_mm))
+		vma->vm_private_data = NULL;
+}
+
+/*
+ * VMA .fault callback for enclave mappings.
+ *
+ * Looks up the enclave page for the faulting address under encl->lock and
+ * inserts its PFN into the VMA, unless a PFN is already mapped there.
+ *
+ * Declared with vm_fault_t, the type used by the .fault callback and by
+ * vmf_insert_pfn(), rather than a plain integer type.
+ *
+ * Return: VM_FAULT_NOPAGE when the address is (or already was) mapped or the
+ * lookup reported -EBUSY, VM_FAULT_SIGBUS in every other case.
+ */
+static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
+{
+	unsigned long addr = (unsigned long)vmf->address;
+	struct vm_area_struct *vma = vmf->vma;
+	struct sgx_encl *encl = vma->vm_private_data;
+	struct sgx_encl_page *entry;
+	vm_fault_t ret = VM_FAULT_NOPAGE;
+	unsigned long pfn;
+
+	/* The VMA has no enclave attached, see sgx_vma_open(). */
+	if (!encl)
+		return VM_FAULT_SIGBUS;
+
+	mutex_lock(&encl->lock);
+
+	entry = sgx_encl_load_page(encl, addr);
+	if (IS_ERR(entry)) {
+		/* Only -EBUSY is not fatal; ret stays VM_FAULT_NOPAGE then. */
+		if (unlikely(PTR_ERR(entry) != -EBUSY))
+			ret = VM_FAULT_SIGBUS;
+
+		goto out;
+	}
+
+	/* follow_pfn() succeeding means a PFN is already mapped at addr. */
+	if (!follow_pfn(vma, addr, &pfn))
+		goto out;
+
+	ret = vmf_insert_pfn(vma, addr, PFN_DOWN(entry->epc_page->desc));
+	if (ret != VM_FAULT_NOPAGE)
+		ret = VM_FAULT_SIGBUS;
+
+out:
+	mutex_unlock(&encl->lock);
+	return ret;
+}
+
+/**
+ * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
+ * @encl:		an enclave pointer
+ * @start:		lower bound of the address range, inclusive
+ * @end:		upper bound of the address range, exclusive
+ * @vm_flags:		VMA flags; only VM_READ, VM_WRITE and VM_EXEC are checked
+ *
+ * Iterate through the enclave pages present within [@start, @end) and verify
+ * that the requested protections do not exceed the maximum protections of any
+ * of them. xas_for_each() only yields entries that exist in the array, so
+ * addresses without an enclave page are not rejected by this function.
+ *
+ * The walk is done under the xa_lock, as the advanced XArray API leaves
+ * locking to the caller. The lock is dropped every XA_CHECK_SCHED entries to
+ * give the scheduler a chance to run when the range is large.
+ *
+ * Return:
+ *   0 on success,
+ *   -EACCES if VMA permissions exceed enclave page permissions or the task
+ *   has the READ_IMPLIES_EXEC personality
+ */
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+		     unsigned long end, unsigned long vm_flags)
+{
+	unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+	unsigned long idx_start = PFN_DOWN(start);
+	unsigned long idx_end = PFN_DOWN(end - 1);
+	struct sgx_encl_page *page;
+	unsigned long count = 0;
+	int ret = 0;
+
+	XA_STATE(xas, &encl->page_array, idx_start);
+
+	/*
+	 * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
+	 * conflict with the enclave page permissions.
+	 */
+	if (current->personality & READ_IMPLIES_EXEC)
+		return -EACCES;
+
+	xas_lock(&xas);
+	xas_for_each(&xas, page, idx_end) {
+		if (~page->vm_max_prot_bits & vm_prot_bits) {
+			ret = -EACCES;
+			break;
+		}
+
+		/* Reschedule on every XA_CHECK_SCHED iteration. */
+		if (!(++count % XA_CHECK_SCHED)) {
+			xas_pause(&xas);
+			xas_unlock(&xas);
+
+			cond_resched();
+
+			xas_lock(&xas);
+		}
+	}
+	xas_unlock(&xas);
+
+	return ret;
+}
+
+/*
+ * VMA .mprotect callback: allow the protection change only if
+ * sgx_encl_may_map() accepts the new flags for [@start, @end), then let
+ * mprotect_fixup() apply it.
+ *
+ * Return: 0 on success, -EACCES if the VMA has no enclave attached, otherwise
+ * the error from sgx_encl_may_map() or mprotect_fixup().
+ */
+static int sgx_vma_mprotect(struct vm_area_struct *vma,
+			    struct vm_area_struct **pprev, unsigned long start,
+			    unsigned long end, unsigned long newflags)
+{
+	struct sgx_encl *encl = vma->vm_private_data;
+	int ret;
+
+	/*
+	 * sgx_vma_open() clears vm_private_data when it cannot track the mm.
+	 * There is no enclave to validate against in that case, so refuse
+	 * instead of passing a NULL enclave on to be dereferenced.
+	 */
+	if (!encl)
+		return -EACCES;
+
+	ret = sgx_encl_may_map(encl, start, end, newflags);
+	if (ret)
+		return ret;
+
+	return mprotect_fixup(vma, pprev, start, end, newflags);
+}
+
+/*
+ * VM operations of enclave mappings. sgx_encl_find() also compares a VMA's
+ * vm_ops against this table to recognize enclave VMAs.
+ */
+const struct vm_operations_struct sgx_vm_ops = {
+	.open = sgx_vma_open,
+	.fault = sgx_vma_fault,
+	.mprotect = sgx_vma_mprotect,
+};
+
+/**
+ * sgx_encl_find - find an enclave
+ * @mm:		mm struct of the current process
+ * @addr:	address in the ELRANGE
+ * @vma:	the resulting VMA
+ *
+ * Look up the VMA that contains @addr and check that it is an enclave VMA,
+ * i.e. that it uses &sgx_vm_ops. When it is, the VMA is stored to @vma even
+ * if no &sgx_encl instance is attached to it.
+ *
+ * Return:
+ *   0 on success,
+ *   -EINVAL if an enclave was not found,
+ *   -ENOENT if the enclave has not been created yet
+ */
+int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+		  struct vm_area_struct **vma)
+{
+	struct vm_area_struct *found = find_vma(mm, addr);
+
+	if (!found || addr < found->vm_start || found->vm_ops != &sgx_vm_ops)
+		return -EINVAL;
+
+	*vma = found;
+
+	if (!found->vm_private_data)
+		return -ENOENT;
+
+	return 0;
+}
+
+/**
+ * sgx_encl_destroy() - destroy enclave resources
+ * @encl:	an enclave pointer
+ *
+ * Mark the enclave dead, free every enclave page together with the EPC page
+ * attached to it, and empty the page array. The SECS EPC page is freed last,
+ * and only when no child pages remain accounted to it.
+ */
+void sgx_encl_destroy(struct sgx_encl *encl)
+{
+	struct sgx_encl_page *encl_page;
+	unsigned long idx;
+
+	atomic_or(SGX_ENCL_DEAD, &encl->flags);
+
+	xa_for_each(&encl->page_array, idx, encl_page) {
+		struct sgx_epc_page *epc = encl_page->epc_page;
+
+		if (epc) {
+			sgx_free_epc_page(epc);
+			encl->secs_child_cnt--;
+			encl_page->epc_page = NULL;
+		}
+
+		kfree(encl_page);
+	}
+
+	xa_destroy(&encl->page_array);
+
+	/* Keep the SECS page while children remain, or if there is none. */
+	if (encl->secs_child_cnt || !encl->secs.epc_page)
+		return;
+
+	sgx_free_epc_page(encl->secs.epc_page);
+	encl->secs.epc_page = NULL;
+}
+
+/**
+ * sgx_encl_release - Destroy an enclave instance
+ * @ref:	address of a kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+	struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+
+	sgx_encl_destroy(encl);
+
+	/* Drop the reference to the backing file, if one was attached. */
+	if (encl->backing)
+		fput(encl->backing);
+
+	cleanup_srcu_struct(&encl->srcu);
+
+	/* All tracked mms should have been removed by now. */
+	WARN_ON_ONCE(!list_empty(&encl->mm_list));
+
+	/* Detect EPC page leaks. */
+	WARN_ON_ONCE(encl->secs_child_cnt);
+	WARN_ON_ONCE(encl->secs.epc_page);
+
+	kfree(encl);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
new file mode 100644
index 000000000000..8ff445476657
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -0,0 +1,85 @@ 
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2016-19 Intel Corporation.
+ */
+#ifndef _X86_ENCL_H
+#define _X86_ENCL_H
+
+#include <linux/cpumask.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/srcu.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include "sgx.h"
+
+/**
+ * enum sgx_encl_page_desc - defines bits for an enclave page's descriptor
+ * @SGX_ENCL_PAGE_ADDR_MASK:		Holds the virtual address of the page.
+ *
+ * The page address for SECS is zero and is used by the subsystem to recognize
+ * the SECS page.
+ */
+enum sgx_encl_page_desc {
+	/* Bits 11:3 are available when the page is not swapped. */
+	SGX_ENCL_PAGE_ADDR_MASK		= PAGE_MASK,
+};
+
+/* Extract the address part of an enclave page's descriptor. */
+#define SGX_ENCL_PAGE_ADDR(page) \
+	((page)->desc & SGX_ENCL_PAGE_ADDR_MASK)
+
+/* One page of an enclave, stored in the page_array of &struct sgx_encl. */
+struct sgx_encl_page {
+	/* Page descriptor, see enum sgx_encl_page_desc. */
+	unsigned long desc;
+	/* Upper bound for the VM_READ/VM_WRITE/VM_EXEC bits of a mapping. */
+	unsigned long vm_max_prot_bits;
+	/* EPC page attached to this page, or NULL when there is none. */
+	struct sgx_epc_page *epc_page;
+	/* Presumably the enclave owning the page - TODO confirm. */
+	struct sgx_encl *encl;
+};
+
+/*
+ * Bits stored in the flags field of &struct sgx_encl.
+ *
+ * SGX_ENCL_INITIALIZED must be set and SGX_ENCL_DEAD clear for a page fault
+ * to be served. SGX_ENCL_DEAD is set when the enclave file is released or the
+ * enclave is destroyed. The remaining bits are not used by encl.c or
+ * driver.c.
+ */
+enum sgx_encl_flags {
+	SGX_ENCL_CREATED	= BIT(0),
+	SGX_ENCL_INITIALIZED	= BIT(1),
+	SGX_ENCL_DEBUG		= BIT(2),
+	SGX_ENCL_DEAD		= BIT(3),
+	SGX_ENCL_IOCTL		= BIT(4),
+};
+
+/* Tracks one mm that has the enclave mapped; created by sgx_encl_mm_add(). */
+struct sgx_encl_mm {
+	/* Enclave this entry belongs to. */
+	struct sgx_encl *encl;
+	/* The tracked address space. */
+	struct mm_struct *mm;
+	/* Link in the enclave's mm_list. */
+	struct list_head list;
+	/* Notifier registered on @mm; its release callback unlinks the entry. */
+	struct mmu_notifier mmu_notifier;
+};
+
+/* State of one enclave, allocated per open of the enclave device. */
+struct sgx_encl {
+	/* Bitmask of enum sgx_encl_flags values, updated atomically. */
+	atomic_t flags;
+	unsigned int page_cnt;
+	/* Decremented for each child EPC page freed in sgx_encl_destroy(). */
+	unsigned int secs_child_cnt;
+	/* Held by the page fault handler and while marking the enclave dead. */
+	struct mutex lock;
+	/* List of struct sgx_encl_mm; writers hold mm_lock, readers use srcu. */
+	struct list_head mm_list;
+	spinlock_t mm_lock;
+	/* Backing file; its reference is dropped in sgx_encl_release(). */
+	struct file *backing;
+	/* Reference count; sgx_encl_release() runs when it drops to zero. */
+	struct kref refcount;
+	struct srcu_struct srcu;
+	unsigned long base;
+	unsigned long size;
+	unsigned long ssaframesize;
+	/* Enclave pages, indexed by PFN_DOWN() of the page address. */
+	struct xarray page_array;
+	/* The SECS page; its EPC page is freed after all child pages. */
+	struct sgx_encl_page secs;
+	cpumask_t cpumask;
+};
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+		  struct vm_area_struct **vma);
+void sgx_encl_destroy(struct sgx_encl *encl);
+void sgx_encl_release(struct kref *ref);
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+		     unsigned long end, unsigned long vm_flags);
+
+#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 97c6895fb6c9..4137254fb29e 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -9,6 +9,8 @@ 
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
+#include "driver.h"
+#include "encl.h"
 #include "encls.h"
 
 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -260,6 +262,8 @@  static bool __init sgx_page_cache_init(void)
 
 static void __init sgx_init(void)
 {
+	int ret;
+
 	if (!boot_cpu_has(X86_FEATURE_SGX))
 		return;
 
@@ -269,8 +273,15 @@  static void __init sgx_init(void)
 	if (!sgx_page_reclaimer_init())
 		goto err_page_cache;
 
+	ret = sgx_drv_init();
+	if (ret)
+		goto err_kthread;
+
 	return;
 
+err_kthread:
+	kthread_stop(ksgxswapd_tsk);
+
 err_page_cache:
 	sgx_page_cache_teardown();
 }