diff mbox series

[RFC,v1,2/3] LSM/x86/sgx: Implement SGX specific hooks in SELinux

Message ID a382d46f66756e13929ca9244479dd9f689c470e.1560131039.git.cedric.xing@intel.com (mailing list archive)
State New, archived
Headers show
Series security/x86/sgx: SGX specific LSM hooks | expand

Commit Message

Xing, Cedric June 10, 2019, 7:03 a.m. UTC
In this patch, SELinux maintains two bits per enclave page, namely SGX__EXECUTE
and SGX__EXECMOD.

SGX__EXECUTE is set initially (by selinux_enclave_load) for every enclave page
that was loaded from a potentially executable source page. SGX__EXECMOD is set
for every page that was loaded from a file that has FILE__EXECMOD.

At runtime, on every protection change (resulted in a call to
selinux_file_mprotect), SGX__EXECUTE is cleared for a page if VM_WRITE is
requested, unless SGX__EXECMOD is set.

To track enclave page protection changes, SELinux has been changed in four
different places.

Firstly, storage is required for storing per page SGX__EXECUTE and SGX__EXECMOD
bits. Given every enclave instance is uniquely tied to an open file (i.e.
struct file), the storage is allocated by extending `file_security_struct`.
More precisely, a new field `esec` has been added, initially zero, to point to
the data structure for tracking per page protection. `esec` will be
allocated/initialized at the first invocation of selinux_enclave_load().

Then, selinux_enclave_load() initializes those 2 bits for every new enclave as
described above. One more detail worth noting, is that selinux_enclave_load()
sets SGX__EXECUTE/SGX__EXECMOD for EAUG'ed pages (for upcoming SGX2) only if
the calling process has FILE__EXECMOD on the sigstruct file.

Afterwards, every change on protection will go through selinux_file_mprotect()
so will be noted. Please note that user space could munmap() then mmap() to
work around mprotect(), but that "leak" could be "plugged" by SGX subsystem
calling security_file_mprotect() explicitly whenever new mappings are created.

Finally, the storage for page protection tracking must be freed when the
associated file is closed. Hence a new selinux_file_free_security() has been
added.

Signed-off-by: Cedric Xing <cedric.xing@intel.com>
---
 security/selinux/Makefile            |   2 +
 security/selinux/hooks.c             |  77 ++++++-
 security/selinux/include/intel_sgx.h |  18 ++
 security/selinux/include/objsec.h    |   3 +
 security/selinux/intel_sgx.c         | 292 +++++++++++++++++++++++++++
 5 files changed, 391 insertions(+), 1 deletion(-)
 create mode 100644 security/selinux/include/intel_sgx.h
 create mode 100644 security/selinux/intel_sgx.c

Comments

Stephen Smalley June 11, 2019, 1:40 p.m. UTC | #1
On 6/10/19 3:03 AM, Cedric Xing wrote:
> In this patch, SELinux maintains two bits per enclave page, namely SGX__EXECUTE
> and SGX__EXECMOD.
> 
> SGX__EXECUTE is set initially (by selinux_enclave_load) for every enclave page
> that was loaded from a potentially executable source page. SGX__EXECMOD is set
> for every page that was loaded from a file that has FILE__EXECMOD.
> 
> At runtime, on every protection change (resulted in a call to
> selinux_file_mprotect), SGX__EXECUTE is cleared for a page if VM_WRITE is
> requested, unless SGX__EXECMOD is set.
> 
> To track enclave page protection changes, SELinux has been changed in four
> different places.
> 
> Firstly, storage is required for storing per page SGX__EXECUTE and SGX__EXECMOD
> bits. Given every enclave instance is uniquely tied to an open file (i.e.
> struct file), the storage is allocated by extending `file_security_struct`.
> More precisely, a new field `esec` has been added, initially zero, to point to
> the data structure for tracking per page protection. `esec` will be
> allocated/initialized at the first invocation of selinux_enclave_load().
> 
> Then, selinux_enclave_load() initializes those 2 bits for every new enclave as
> described above. One more detail worth noting, is that selinux_enclave_load()
> sets SGX__EXECUTE/SGX__EXECMOD for EAUG'ed pages (for upcoming SGX2) only if
> the calling process has FILE__EXECMOD on the sigstruct file.
> 
> Afterwards, every change on protection will go through selinux_file_mprotect()
> so will be noted. Please note that user space could munmap() then mmap() to
> work around mprotect(), but that "leak" could be "plugged" by SGX subsystem
> calling security_file_mprotect() explicitly whenever new mappings are created.
> 
> Finally, the storage for page protection tracking must be freed when the
> associated file is closed. Hence a new selinux_file_free_security() has been
> added.
> 
> Signed-off-by: Cedric Xing <cedric.xing@intel.com>
> ---
>   security/selinux/Makefile            |   2 +
>   security/selinux/hooks.c             |  77 ++++++-
>   security/selinux/include/intel_sgx.h |  18 ++
>   security/selinux/include/objsec.h    |   3 +
>   security/selinux/intel_sgx.c         | 292 +++++++++++++++++++++++++++
>   5 files changed, 391 insertions(+), 1 deletion(-)
>   create mode 100644 security/selinux/include/intel_sgx.h
>   create mode 100644 security/selinux/intel_sgx.c
> 
> diff --git a/security/selinux/Makefile b/security/selinux/Makefile
> index ccf950409384..58a05a9639e0 100644
> --- a/security/selinux/Makefile
> +++ b/security/selinux/Makefile
> @@ -14,6 +14,8 @@ selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
>   
>   selinux-$(CONFIG_NETLABEL) += netlabel.o
>   
> +selinux-$(CONFIG_INTEL_SGX) += intel_sgx.o
> +
>   ccflags-y := -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include
>   
>   $(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 3ec702cf46ca..17f855871a41 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -103,6 +103,7 @@
>   #include "netlabel.h"
>   #include "audit.h"
>   #include "avc_ss.h"
> +#include "intel_sgx.h"
>   
>   struct selinux_state selinux_state;
>   
> @@ -3485,6 +3486,11 @@ static int selinux_file_alloc_security(struct file *file)
>   	return file_alloc_security(file);
>   }
>   
> +static void selinux_file_free_security(struct file *file)
> +{
> +	sgxsec_enclave_free(file);
> +}
> +
>   /*
>    * Check whether a task has the ioctl permission and cmd
>    * operation to an inode.
> @@ -3656,6 +3662,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
>   				 unsigned long reqprot,
>   				 unsigned long prot)
>   {
> +	int rc;
>   	const struct cred *cred = current_cred();
>   	u32 sid = cred_sid(cred);
>   
> @@ -3664,7 +3671,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
>   
>   	if (default_noexec &&
>   	    (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
> -		int rc = 0;
> +		rc = 0;
>   		if (vma->vm_start >= vma->vm_mm->start_brk &&
>   		    vma->vm_end <= vma->vm_mm->brk) {
>   			rc = avc_has_perm(&selinux_state,
> @@ -3691,6 +3698,12 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
>   			return rc;
>   	}
>   
> +#ifdef CONFIG_INTEL_SGX
> +	rc = sgxsec_mprotect(vma, prot);
> +	if (rc <= 0)
> +		return rc;

Why are you skipping the file_map_prot_check() call when rc == 0?
What would SELinux check if you didn't do so - 
FILE__READ|FILE__WRITE|FILE__EXECUTE to /dev/sgx/enclave?  Is it a 
problem to let SELinux proceed with that check?

> +#endif
> +
>   	return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED);
>   }
>   
> @@ -6726,6 +6739,62 @@ static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
>   }
>   #endif
>   
> +#ifdef CONFIG_INTEL_SGX
> +
> +static int selinux_enclave_load(struct file *encl, unsigned long addr,
> +				unsigned long size, unsigned long prot,
> +				struct vm_area_struct *source)
> +{
> +	if (source) {
> +		/**
> +		 * Adding page from source => EADD request
> +		 */
> +		int rc = selinux_file_mprotect(source, prot, prot);
> +		if (rc)
> +			return rc;
> +
> +		if (!(prot & VM_EXEC) &&
> +		    selinux_file_mprotect(source, VM_EXEC, VM_EXEC))

I wouldn't conflate VM_EXEC with PROT_EXEC even if they happen to be 
defined with the same values currently.  Elsewhere the kernel appears to 
explicitly translate them ala calc_vm_prot_bits().

Also, this will mean that we will always perform an execute check on all 
sources, thereby triggering audit denial messages for any EADD sources 
that are only intended to be data.  Depending on the source, this could 
trigger PROCESS__EXECMEM or FILE__EXECMOD or FILE__EXECUTE.  In a world 
where users often just run any denials they see through audit2allow, 
they'll end up always allowing them all.  How can they tell whether it 
was needed? It would be preferable if we could only trigger execute 
checks when there is some probability that execute will be requested in 
the future.  Alternatives would be to silence the audit of these 
permission checks always via use of _noaudit() interfaces or to silence 
audit of these permissions via dontaudit rules in policy, but the latter 
would hide all denials of the permission by the process, not just those 
triggered from security_enclave_load().  And if we silence them, then we 
won't see them even if they were needed.

> +			prot = 0;
> +		else {
> +			prot = SGX__EXECUTE;
> +			if (source->vm_file &&
> +			    !file_has_perm(current_cred(), source->vm_file,
> +					   FILE__EXECMOD))
> +				prot |= SGX__EXECMOD;

Similarly, this means that we will always perform a FILE__EXECMOD check 
on all executable sources, triggering audit denial messages for any EADD 
source that is executable but to which EXECMOD is not allowed, and again 
the most common pattern will be that users will add EXECMOD to all 
executable sources to avoid this.

> +		}
> +		return sgxsec_eadd(encl, addr, size, prot);
> +	} else {
> +		/**
> +		  * Adding page from NULL => EAUG request
> +		  */
> +		return sgxsec_eaug(encl, addr, size, prot);
> +	}
> +}
> +
> +static int selinux_enclave_init(struct file *encl,
> +				const struct sgx_sigstruct *sigstruct,
> +				struct vm_area_struct *vma)
> +{
> +	int rc = 0;
> +
> +	if (!vma)
> +		rc = -EINVAL;

Is it ever valid to call this hook with a NULL vma?  If not, this should 
be handled/prevented by the caller.  If so, I'd just return -EINVAL 
immediately here.

> +
> +	if (!rc && !(vma->vm_flags & VM_EXEC))
> +		rc = selinux_file_mprotect(vma, VM_EXEC, VM_EXEC);

I had thought we were trying to avoid overloading FILE__EXECUTE (or 
whatever gets checked here, e.g. could be PROCESS__EXECMEM or 
FILE__EXECMOD) on the sigstruct file, since the caller isn't truly 
executing code from it.

I'd define new ENCLAVE__* permissions, including an up-front 
ENCLAVE__INIT permission that governs whether the sigstruct file can be 
used at all irrespective of memory protections.

Then you can also have ENCLAVE__EXECUTE, ENCLAVE__EXECMEM, 
ENCLAVE__EXECMOD for the execute-related checks.  Or you can use the 
/dev/sgx/enclave inode as the target for the execute checks and just 
reuse the file permissions there.

> +
> +	if (!rc) {
> +		if (vma->vm_file)
> +			rc = file_has_perm(current_cred(), vma->vm_file,
> +					   FILE__EXECMOD);

Similar issue here with always triggering EXECMOD audit denials even if 
never required.

> +		rc = sgxsec_einit(encl, sigstruct, !rc);
> +	}
> +	return rc;
> +}
> +
> +#endif
> +
>   struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
>   	.lbs_cred = sizeof(struct task_security_struct),
>   	.lbs_file = sizeof(struct file_security_struct),
> @@ -6808,6 +6877,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
>   
>   	LSM_HOOK_INIT(file_permission, selinux_file_permission),
>   	LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
> +	LSM_HOOK_INIT(file_free_security, selinux_file_free_security),
>   	LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
>   	LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
>   	LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
> @@ -6968,6 +7038,11 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
>   	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
>   	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
>   #endif
> +
> +#ifdef CONFIG_INTEL_SGX
> +	LSM_HOOK_INIT(enclave_load, selinux_enclave_load),
> +	LSM_HOOK_INIT(enclave_init, selinux_enclave_init),
> +#endif
>   };
>   
>   static __init int selinux_init(void)
> diff --git a/security/selinux/include/intel_sgx.h b/security/selinux/include/intel_sgx.h
> new file mode 100644
> index 000000000000..8f9c6c734921
> --- /dev/null
> +++ b/security/selinux/include/intel_sgx.h
> @@ -0,0 +1,18 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-18 Intel Corporation.
> +
> +#ifndef _SELINUX_SGXSEC_H_
> +#define _SELINUX_SGXSEC_H_
> +
> +#include <linux/lsm_hooks.h>
> +
> +#define SGX__EXECUTE	1
> +#define SGX__EXECMOD	2
> +
> +void sgxsec_enclave_free(struct file *);
> +int sgxsec_mprotect(struct vm_area_struct *, size_t);
> +int sgxsec_eadd(struct file *, size_t, size_t, size_t);
> +int sgxsec_eaug(struct file *, size_t, size_t, size_t);
> +int sgxsec_einit(struct file *, const struct sgx_sigstruct *, int);
> +
> +#endif
> diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
> index 231262d8eac9..0fb4da7e3a8a 100644
> --- a/security/selinux/include/objsec.h
> +++ b/security/selinux/include/objsec.h
> @@ -71,6 +71,9 @@ struct file_security_struct {
>   	u32 fown_sid;		/* SID of file owner (for SIGIO) */
>   	u32 isid;		/* SID of inode at the time of file open */
>   	u32 pseqno;		/* Policy seqno at the time of file open */
> +#ifdef CONFIG_INTEL_SGX
> +	atomic_long_t esec;
> +#endif
>   };
>   
>   struct superblock_security_struct {
> diff --git a/security/selinux/intel_sgx.c b/security/selinux/intel_sgx.c
> new file mode 100644
> index 000000000000..37dacf5c295f
> --- /dev/null
> +++ b/security/selinux/intel_sgx.c
> @@ -0,0 +1,292 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
> +// Copyright(c) 2016-18 Intel Corporation.
> +
> +#include "objsec.h"
> +#include "intel_sgx.h"
> +
> +struct region {
> +	struct list_head	link;
> +	size_t			start;
> +	size_t			end;
> +	size_t			data;
> +};
> +
> +static inline struct region *region_new(void)
> +{
> +	struct region *n = kzalloc(sizeof(struct region), GFP_KERNEL);
> +	if (n)
> +		INIT_LIST_HEAD(&n->link);
> +	return n;
> +}
> +
> +static inline void region_free(struct region *r)
> +{
> +	list_del(&r->link);
> +	kfree(r);
> +}
> +
> +static struct list_head *
> +region_apply_to_range(struct list_head *rgs,
> +		      size_t start, size_t end,
> +		      struct list_head *(*cb)(struct region *,
> +					      size_t, size_t, size_t),
> +		      size_t arg)
> +{
> +	struct region *r, *n;
> +
> +	list_for_each_entry(r, rgs, link)
> +		if (start < r-> end)
> +			break;
> +
> +	if (&r->link == rgs || end <= r->start)
> +		return rgs;
> +
> +	do {
> +		struct list_head *ret;
> +		n = list_next_entry(r, link);
> +		ret = (*cb)(r, start, end, arg);
> +		if (ret)
> +			return ret;
> +		r = n;
> +	} while (&r->link != rgs && r->start < end);
> +	return &r->link;
> +}
> +
> +static struct list_head *
> +region_clear_cb(struct region *r, size_t start, size_t end, size_t arg)
> +{
> +	if (end < r->end) {
> +		if (start > r->start) {
> +			struct region *n = region_new();
> +			if (unlikely(!n))
> +				return ERR_PTR(-ENOMEM);
> +
> +			n->start = r->start;
> +			n->end = start;
> +			n->data = r->data;
> +			list_add_tail(&n->link, &r->link);
> +		}
> +		r->start = end;
> +		return &r->link;
> +	}
> +
> +	if (start > r->start)
> +		r->end = start;
> +	else
> +		region_free(r);
> +	return NULL;
> +}
> +
> +static inline struct list_head *
> +region_clear_range(struct list_head *rgs, size_t start, size_t end)
> +{
> +	return region_apply_to_range(rgs, start, end, region_clear_cb, 0);
> +}
> +
> +static struct list_head *
> +region_add_range(struct list_head *rgs, size_t start, size_t end, size_t data)
> +{
> +	struct region *r, *n;
> +
> +	n = list_entry(region_clear_range(rgs, start, end), typeof(*n), link);
> +	if (unlikely(IS_ERR_VALUE(&n->link)))
> +		return &n->link;
> +
> +	if (&n->link != rgs && end == n->start && data == n->data) {
> +		n->start = start;
> +		r = n;
> +	} else {
> +		r = region_new();
> +		if (unlikely(!r))
> +			return ERR_PTR(-ENOMEM);
> +
> +		r->start = start;
> +		r->end = end;
> +		r->data = data;
> +		list_add_tail(&r->link, &n->link);
> +	}
> +
> +	n = list_prev_entry(r, link);
> +	if (&n->link != rgs && start == n->end && data == n->data) {
> +		r->start = n->start;
> +		region_free(n);
> +	}
> +
> +	return &r->link;
> +}
> +
> +static inline int
> +enclave_add_pages(struct list_head *rgs, size_t start, size_t end, size_t flags)
> +{
> +	void *p = region_add_range(rgs, start, end, flags);
> +	return PTR_ERR_OR_ZERO(p);
> +}
> +
> +static inline int enclave_prot_allowed(size_t prot, size_t flags)
> +{
> +	return !(prot & VM_EXEC) || (flags & SGX__EXECUTE);
> +}
> +
> +static struct list_head *
> +enclave_prot_check_cb(struct region *r, size_t start, size_t end, size_t prot)
> +{
> +	if (!enclave_prot_allowed(prot, r->data))
> +		return ERR_PTR(-EACCES);
> +	return NULL;
> +}
> +
> +static struct list_head *
> +enclave_prot_set_cb(struct region *r, size_t start, size_t end, size_t prot)
> +{
> +	BUG_ON(!enclave_prot_allowed(prot, r->data));
> +
> +	if (!(prot & VM_WRITE) ||
> +	    (r->data & SGX__EXECMOD) ||
> +	    !(r->data & SGX__EXECUTE))
> +		return NULL;
> +
> +	if (end < r->end) {
> +		struct region *n = region_new();
> +		if (unlikely(!n))
> +			return ERR_PTR(-ENOMEM);
> +
> +		n->start = end;
> +		n->end = r->end;
> +		n->data = r->data;
> +		r->end = end;
> +		list_add(&n->link, &r->link);
> +	}
> +
> +	if (start > r->start) {
> +		struct region *n = region_new();
> +		if (unlikely(!n))
> +			return ERR_PTR(-ENOMEM);
> +
> +		n->start = r->start;
> +		n->end = start;
> +		n->data = r->data;
> +		r->start = start;
> +		list_add_tail(&n->link, &r->link);
> +	}
> +
> +	r->data &= ~SGX__EXECUTE;
> +	return NULL;
> +}
> +
> +static inline int
> +enclave_mprotect(struct list_head *rgs, size_t start, size_t end, size_t prot)
> +{
> +	void *ret;
> +
> +	ret = region_apply_to_range(rgs, start, end,
> +				    enclave_prot_check_cb, prot);
> +	if (!IS_ERR_VALUE(ret) && (prot & VM_WRITE))
> +		ret = region_apply_to_range(rgs, start, end,
> +					    enclave_prot_set_cb, prot);
> +	return PTR_ERR_OR_ZERO(ret);
> +}
> +
> +struct enclave_sec {
> +	struct rw_semaphore	sem;
> +	struct list_head	regions;
> +	size_t			eaug_perm;
> +};
> +
> +static inline struct enclave_sec *__esec(struct file_security_struct *fsec)
> +{
> +	return (struct enclave_sec *)atomic_long_read(&fsec->esec);
> +}
> +
> +static struct enclave_sec *encl_esec(struct file *encl)
> +{
> +	struct file_security_struct *fsec = selinux_file(encl);
> +	struct enclave_sec *esec = __esec(fsec);
> +
> +	if (unlikely(!esec)) {
> +		long n;
> +
> +		esec = kzalloc(sizeof(*esec), GFP_KERNEL);
> +		if (!esec)
> +			return NULL;
> +
> +		init_rwsem(&esec->sem);
> +		INIT_LIST_HEAD(&esec->regions);
> +
> +		n = atomic_long_cmpxchg(&fsec->esec, 0, (long)esec);
> +		if (n) {
> +			kfree(esec);
> +			esec = (typeof(esec))n;
> +		}
> +	}
> +
> +	return esec;
> +}
> +
> +void sgxsec_enclave_free(struct file *encl)
> +{
> +	struct enclave_sec *esec = __esec(selinux_file(encl));
> +
> +	if (esec) {
> +		struct region *r, *n;
> +
> +		BUG_ON(rwsem_is_locked(&esec->sem));
> +
> +		list_for_each_entry_safe(r, n, &esec->regions, link)
> +			region_free(r);
> +
> +		kfree(esec);
> +	}
> +}
> +
> +int sgxsec_mprotect(struct vm_area_struct *vma, size_t prot)
> +{
> +	struct enclave_sec *esec;
> +	int rc;
> +
> +	if (!vma->vm_file || !(esec = __esec(selinux_file(vma->vm_file)))) {
> +		/* Positive return value indicates non-enclave VMA */
> +		return 1;
> +	}
> +
> +	down_read(&esec->sem);
> +	rc = enclave_mprotect(&esec->regions, vma->vm_start, vma->vm_end, prot);

Why is it safe for this to only use down_read()? enclave_mprotect() can 
call enclave_prot_set_cb() which modifies the list?

> +	up_read(&esec->sem);
> +	return rc;
> +}
> +
> +int sgxsec_eadd(struct file *encl, size_t start, size_t size, size_t perm)
> +{
> +	struct enclave_sec *esec = encl_esec(encl);
> +	int rc;
> +
> +	if (down_write_killable(&esec->sem))
> +		return -EINTR;
> +	rc = enclave_add_pages(&esec->regions, start, start + size, perm);
> +	up_write(&esec->sem);
> +	return rc;
> +}
> +
> +int sgxsec_eaug(struct file *encl, size_t start, size_t size, size_t prot)
> +{
> +	struct enclave_sec *esec = encl_esec(encl);
> +	int rc = -EPERM;
> +
> +	if (down_write_killable(&esec->sem))
> +		return -EINTR;
> +	if (enclave_prot_allowed(prot, esec->eaug_perm))
> +		rc = enclave_add_pages(&esec->regions, start, start + size,
> +				       esec->eaug_perm);
> +	up_write(&esec->sem);
> +	return rc;
> +}
> +
> +int sgxsec_einit(struct file *encl, const struct sgx_sigstruct *sigstruct, int execmod)
> +{
> +	struct enclave_sec *esec = encl_esec(encl);
> +
> +	if (down_write_killable(&esec->sem))
> +		return -EINTR;
> +	esec->eaug_perm = execmod ? SGX__EXECUTE | SGX__EXECMOD : 0;
> +	up_write(&esec->sem);
> +	return 0;
> +}

I haven't looked at this code closely, but it feels like a lot of 
SGX-specific logic embedded into SELinux that will have to be repeated 
or reused for every security module.  Does SGX not track this state itself?
Sean Christopherson June 11, 2019, 10:02 p.m. UTC | #2
On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> I haven't looked at this code closely, but it feels like a lot of
> SGX-specific logic embedded into SELinux that will have to be repeated or
> reused for every security module.  Does SGX not track this state itself?

SGX does track equivalent state.

There are three proposals on the table (I think):

  1. Require userspace to explicitly specificy (maximal) enclave page
     permissions at build time.  The enclave page permissions are provided
     to, and checked by, LSMs at enclave build time.

     Pros: Low-complexity kernel implementation, straightforward auditing
     Cons: Sullies the SGX UAPI to some extent, may increase complexity of
           SGX2 enclave loaders.

  2. Pre-check LSM permissions and dynamically track mappings to enclave
     pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
     based on the pre-checked permissions.

     Pros: Does not impact SGX UAPI, medium kernel complexity
     Cons: Auditing is complex/weird, requires taking enclave-specific
           lock during mprotect() to query/update tracking.

  3. Implement LSM hooks in SGX to allow LSMs to track enclave regions
     from cradle to grave, but otherwise defer everything to LSMs.

     Pros: Does not impact SGX UAPI, maximum flexibility, precise auditing
     Cons: Most complex and "heaviest" kernel implementation of the three,
           pushes more SGX details into LSMs.

My RFC series[1] implements #1.  My understanding is that Andy (Lutomirski)
prefers #2.  Cedric's RFC series implements #3.

Perhaps the easiest way to make forward progress is to rule out the
options we absolutely *don't* want by focusing on the potentially blocking
issue with each option:

  #1 - SGX UAPI funkiness

  #2 - Auditing complexity, potential enclave lock contention

  #3 - Pushing SGX details into LSMs and complexity of kernel implementation


[1] https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherson@intel.com
Xing, Cedric June 11, 2019, 10:55 p.m. UTC | #3
> From: linux-sgx-owner@vger.kernel.org [mailto:linux-sgx-
> owner@vger.kernel.org] On Behalf Of Stephen Smalley
> Sent: Tuesday, June 11, 2019 6:40 AM
> 
> >
> > +#ifdef CONFIG_INTEL_SGX
> > +	rc = sgxsec_mprotect(vma, prot);
> > +	if (rc <= 0)
> > +		return rc;
> 
> Why are you skipping the file_map_prot_check() call when rc == 0?
> What would SELinux check if you didn't do so -
> FILE__READ|FILE__WRITE|FILE__EXECUTE to /dev/sgx/enclave?  Is it a
> problem to let SELinux proceed with that check?

We can continue the check. But in practice, all FILE__{READ|WRITE|EXECUTE} are needed for every enclave, then what's the point of checking them? FILE__EXECMOD may be the only flag that has a meaning, but it's kind of redundant because sigstruct file was checked against that already.

> > +static int selinux_enclave_load(struct file *encl, unsigned long addr,
> > +				unsigned long size, unsigned long prot,
> > +				struct vm_area_struct *source)
> > +{
> > +	if (source) {
> > +		/**
> > +		 * Adding page from source => EADD request
> > +		 */
> > +		int rc = selinux_file_mprotect(source, prot, prot);
> > +		if (rc)
> > +			return rc;
> > +
> > +		if (!(prot & VM_EXEC) &&
> > +		    selinux_file_mprotect(source, VM_EXEC, VM_EXEC))
> 
> I wouldn't conflate VM_EXEC with PROT_EXEC even if they happen to be
> defined with the same values currently.  Elsewhere the kernel appears to
> explicitly translate them ala calc_vm_prot_bits().

Thanks! I'd change them to PROT_EXEC in the next version.

> 
> Also, this will mean that we will always perform an execute check on all
> sources, thereby triggering audit denial messages for any EADD sources
> that are only intended to be data.  Depending on the source, this could
> trigger PROCESS__EXECMEM or FILE__EXECMOD or FILE__EXECUTE.  In a world
> where users often just run any denials they see through audit2allow,
> they'll end up always allowing them all.  How can they tell whether it
> was needed? It would be preferable if we could only trigger execute
> checks when there is some probability that execute will be requested in
> the future.  Alternatives would be to silence the audit of these
> permission checks always via use of _noaudit() interfaces or to silence
> audit of these permissions via dontaudit rules in policy, but the latter
> would hide all denials of the permission by the process, not just those
> triggered from security_enclave_load().  And if we silence them, then we
> won't see them even if they were needed.

*_noaudit() is exactly what I wanted. But I couldn't find selinux_file_mprotect_noaudit()/file_has_perm_noaudit(), and I'm reluctant to duplicate code. Any suggestions?
 
> 
> > +			prot = 0;
> > +		else {
> > +			prot = SGX__EXECUTE;
> > +			if (source->vm_file &&
> > +			    !file_has_perm(current_cred(), source->vm_file,
> > +					   FILE__EXECMOD))
> > +				prot |= SGX__EXECMOD;
> 
> Similarly, this means that we will always perform a FILE__EXECMOD check
> on all executable sources, triggering audit denial messages for any EADD
> source that is executable but to which EXECMOD is not allowed, and again
> the most common pattern will be that users will add EXECMOD to all
> executable sources to avoid this.
> 
> > +		}
> > +		return sgxsec_eadd(encl, addr, size, prot);
> > +	} else {
> > +		/**
> > +		  * Adding page from NULL => EAUG request
> > +		  */
> > +		return sgxsec_eaug(encl, addr, size, prot);
> > +	}
> > +}
> > +
> > +static int selinux_enclave_init(struct file *encl,
> > +				const struct sgx_sigstruct *sigstruct,
> > +				struct vm_area_struct *vma)
> > +{
> > +	int rc = 0;
> > +
> > +	if (!vma)
> > +		rc = -EINVAL;
> 
> Is it ever valid to call this hook with a NULL vma?  If not, this should
> be handled/prevented by the caller.  If so, I'd just return -EINVAL
> immediately here.

vma shall never be NULL. I'll update it in the next version.

> 
> > +
> > +	if (!rc && !(vma->vm_flags & VM_EXEC))
> > +		rc = selinux_file_mprotect(vma, VM_EXEC, VM_EXEC);
> 
> I had thought we were trying to avoid overloading FILE__EXECUTE (or
> whatever gets checked here, e.g. could be PROCESS__EXECMEM or
> FILE__EXECMOD) on the sigstruct file, since the caller isn't truly
> executing code from it.

Agreed. Another problem with FILE__EXECMOD on the sigstruct file is that user code would then be allowed to modify SIGSTRUCT at will, which effectively wipes out the protection provided by FILE__EXECUTE.

> 
> I'd define new ENCLAVE__* permissions, including an up-front
> ENCLAVE__INIT permission that governs whether the sigstruct file can be
> used at all irrespective of memory protections.

Agreed.

> 
> Then you can also have ENCLAVE__EXECUTE, ENCLAVE__EXECMEM,
> ENCLAVE__EXECMOD for the execute-related checks.  Or you can use the
> /dev/sgx/enclave inode as the target for the execute checks and just
> reuse the file permissions there.

Now we've got 2 options - 1) New ENCLAVE__* flags on sigstruct file or 2) FILE__* on /dev/sgx/enclave. Which one do you think makes more sense?

ENCLAVE__EXECMEM seems to offer finer granularity (than PROCESS__EXECMEM) but I wonder if it'd have any real use in practice.

> > +int sgxsec_mprotect(struct vm_area_struct *vma, size_t prot) {
> > +	struct enclave_sec *esec;
> > +	int rc;
> > +
> > +	if (!vma->vm_file || !(esec = __esec(selinux_file(vma->vm_file))))
> {
> > +		/* Positive return value indicates non-enclave VMA */
> > +		return 1;
> > +	}
> > +
> > +	down_read(&esec->sem);
> > +	rc = enclave_mprotect(&esec->regions, vma->vm_start, vma->vm_end,
> > +prot);
> 
> Why is it safe for this to only use down_read()? enclave_mprotect() can
> call enclave_prot_set_cb() which modifies the list?

Probably because it was too late at night when I wrote this line:-( Good catch!

> 
> I haven't looked at this code closely, but it feels like a lot of SGX-
> specific logic embedded into SELinux that will have to be repeated or
> reused for every security module.  Does SGX not track this state itself?

I can tell you have looked quite closely, and I truly think you for your time!

You are right that there are SGX specific stuff. More precisely, SGX enclaves don't have access to anything except memory, so there are only 3 questions that need to be answered for each enclave page: 1) whether X is allowed; 2) whether W->X is allowed and 3 whether WX is allowed. This proposal tries to cache the answers to those questions upon creation of each enclave page, meaning it involves a) figuring out the answers and b) "remember" them for every page. #b is generic, mostly captured in intel_sgx.c, and could be shared among all LSM modules; while #a is SELinux specific. I could move intel_sgx.c up one level in the directory hierarchy if that's what you'd suggest.

By "SGX", did you mean the SGX subsystem being upstreamed? It doesn’t track that state. In practice, there's no way for SGX to track it because there's no vm_ops->may_mprotect() callback. It doesn't follow the philosophy of Linux either, as mprotect() doesn't track it for regular memory. And it doesn't have a use without LSM, so I believe it makes more sense to track it inside LSM.
Dr. Greg June 12, 2019, 9:32 a.m. UTC | #4
On Tue, Jun 11, 2019 at 03:02:43PM -0700, Sean Christopherson wrote:

Good morning, I hope the week is proceeding well for everyone.

> On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> > I haven't looked at this code closely, but it feels like a lot of
> > SGX-specific logic embedded into SELinux that will have to be repeated or
> > reused for every security module.  Does SGX not track this state itself?

> SGX does track equivalent state.
> 
> There are three proposals on the table (I think):
> 
>   1. Require userspace to explicitly specificy (maximal) enclave page
>      permissions at build time.  The enclave page permissions are provided
>      to, and checked by, LSMs at enclave build time.
> 
>      Pros: Low-complexity kernel implementation, straightforward auditing
>      Cons: Sullies the SGX UAPI to some extent, may increase complexity of
>            SGX2 enclave loaders.
> 
>   2. Pre-check LSM permissions and dynamically track mappings to enclave
>      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
>      based on the pre-checked permissions.
> 
>      Pros: Does not impact SGX UAPI, medium kernel complexity
>      Cons: Auditing is complex/weird, requires taking enclave-specific
>            lock during mprotect() to query/update tracking.
> 
>   3. Implement LSM hooks in SGX to allow LSMs to track enclave regions
>      from cradle to grave, but otherwise defer everything to LSMs.
> 
>      Pros: Does not impact SGX UAPI, maximum flexibility, precise auditing
>      Cons: Most complex and "heaviest" kernel implementation of the three,
>            pushes more SGX details into LSMs.
> 
> My RFC series[1] implements #1.  My understanding is that Andy (Lutomirski)
> prefers #2.  Cedric's RFC series implements #3.
> 
> Perhaps the easiest way to make forward progress is to rule out the
> options we absolutely *don't* want by focusing on the potentially blocking
> issue with each option:
>
>   #1 - SGX UAPI funkiness
> 
>   #2 - Auditing complexity, potential enclave lock contention
> 
>   #3 - Pushing SGX details into LSMs and complexity of kernel implementation

At the risk of repeating myself, I believe the issue that has not
received full clarity is that, for a security relevant solution, there
has to be two separate aspects of LSM coverage for SGX.  I believe
that a high level review of the requirements may assist in selection
of a course of action for the driver.

The first aspect of LSM control has been covered extensively and that
is the notion of implementing control over the ability of a user
identity to request some cohort of page privileges.  The cohort of
obvious concern is the ability of a page to possess both WRITE and
EXECUTE privileges at sometime during its lifetime.

Given that SGX2 support is the ultimate and necesary goal for this
driver, the selected proposal should be the one that gives the most
simplistic application of this policy.  As I have noted previously,
once SGX2 becomes available, the only relevant security control that
can be realized with this type of LSM support is whether or not the
platform owner wishes to limit access by a user identity to the
ability to dynamically load code in enclave context.

With SGX2 we will, by necessity, have to admit the notion that a
platform owner will not have any effective visibility into code that
is loaded and executed, since it can come in over a secured network
connection in an enclave security context.  This advocates for the
simplest approach possible to providing some type of regulation to any
form of WX page access.

Current state of the art, and there doesn't appear to be a reason to
change this, is to package an enclave in the form of an ELF shared
library.  It seems straight forward to inherit and act on page
privileges from the privileges specified on the ELF sections that are
loaded.  Loaders will have a file descriptor available so an mmap of
the incoming page with the specified privileges should trigger the
required LSM interventions and tie them to a specific enclave.

The current enclave 'standard' also uses layout metadata, stored in a
special .notes section of the shared image, to direct a loader with
respect to construction of the enclave stack, heap, TCS and other
miscellaneous regions not directly coded by the ELF TEXT sections.  It
seems straight forward to extend this paradigm to declare region(s) of
an enclave that are eligible to be generated at runtime (EAUG'ed) with
the RWX protections needed to support dynamically loaded code.

If an enclave wishes to support this functionality, it would seem
straight forward to require an enclave to provide a single zero page
which the loader will mmap with those protections in order to trigger
the desired LSM checks against that specific enclave.

The simplest driver approach that achieves the desired introspection
of permissions in the described framework will implement as much LSM
security as is possible with SGX technology and with minimal
disruption to the existing SGX software eco-system.

This leaves the second aspect of LSM security and that is the ability
to inspect and act on the initialized characteristics of the enclave.
This is the aspect of SGX LSM functionality that has not been clearly
called out.

All that is needed here is an LSM hook that gets handed a pointer to
the signature structure (SIGSTRUCT) that is passed to the EINIT ioctl.
If the SIGSTRUCT does not match the proposed enclave image that the
processor has computed secondary to the enclave image creation process
the enclave will not initialize, so all that is needed is for an LSM
to be allowed to interpret and act on the characteristics defined in
that structure before the enclave is actually initialized.

As we have now collectively demonstrated, it is easy to get lost in
minutia with respect to all of this.  I believe if we can focus on a
solution that implements what I have discussed above we will achieve
as much as can be achieved with respect to platform security for SGX
systems.

Best wishes for a productive remainder of the week.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686            EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Nullum magnum ingenium sine mixtura dementiae fuit."
        (There is no great genius without some touch of madness.)
                                -- Seneca
Sean Christopherson June 12, 2019, 2:25 p.m. UTC | #5
On Wed, Jun 12, 2019 at 04:32:21AM -0500, Dr. Greg wrote:
> With SGX2 we will, by necessity, have to admit the notion that a
> platform owner will not have any effective visibility into code that
> is loaded and executed, since it can come in over a secured network
> connection in an enclave security context.  This advocates for the
> simplest approach possible to providing some type of regulation to any
> form of WX page access.

I believe we're all on the same page in the sense that we all want the
"simplest approach possible", but there's a sliding scale of complexity
between the kernel and userspace.  We can make life simple for userspace
at the cost of additional complexity in the kernel, and vice versa.  The
disagreement is over where to shove the extra complexity.

> Current state of the art, and there doesn't appear to be a reason to
> change this, is to package an enclave in the form of an ELF shared
> library.  It seems straight forward to inherit and act on page
> privileges from the privileges specified on the ELF sections that are
> loaded.  Loaders will have a file descriptor available so an mmap of
> the incoming page with the specified privileges should trigger the
> required LSM interventions and tie them to a specific enclave.
> 
> The current enclave 'standard' also uses layout metadata, stored in a
> special .notes section of the shared image, to direct a loader with
> respect to construction of the enclave stack, heap, TCS and other
> miscellaneous regions not directly coded by the ELF TEXT sections.  It
> seems straight forward to extend this paradigm to declare region(s) of
> an enclave that are eligible to be generated at runtime (EAUG'ed) with
> the RWX protections needed to support dynamically loaded code.
> 
> If an enclave wishes to support this functionality, it would seem
> straight forward to require an enclave to provide a single zero page
> which the loader will mmap with those protections in order to trigger
> the desired LSM checks against that specific enclave.

This is effectively #1, e.g. would require userspace to pre-declare its
intent to make regions W->X.
Andy Lutomirski June 12, 2019, 7:30 p.m. UTC | #6
On Tue, Jun 11, 2019 at 3:02 PM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> > I haven't looked at this code closely, but it feels like a lot of
> > SGX-specific logic embedded into SELinux that will have to be repeated or
> > reused for every security module.  Does SGX not track this state itself?
>
> SGX does track equivalent state.
>
> There are three proposals on the table (I think):

Sounds about right.  I've been playing with #1 and #2 (as text, not
code), and I'll post my latest thoughts on it below.  But first, I
should mention that I think we've gotten a bit too caught up on
SELinux-y terminology like "EXECMOD" and "EXECMEM", which is relevant
since the kernel has very little visibility into what the enclave is
doing.  Instead, I think we should think about the relevant
permissions more like this:

a) "execute code from a particular source, e.g. a file"
b) "execute code supplied from arbitrary memory outside the enclave"
c) "execute code generated within the enclave"
d) "possess WX enclave memory"

I think that any sensible policy that allows (b) should allow (a).
Similarly, any policy that allows (d) should allow (c).   I don't see
any particular need for the kernel to go out of its way to ensure
these relationships, though.

We could plausibly also distinguish "execute measured code", although
I think that the details of defining and implenenting this, especially
with SGX2, could be nastier than we want to deal with.  A minimal
approach that mostly ignores SGX2 would be to have another permission
"execute code supplied from outside the enclave that was not
measured".  This permission would be required on top of (a) or (b),
depending on where that code comes from.

If we want to map these to existing SELinux terms, we could use
EXECUTE for (a), EXECMOD for (c), and EXECMEM for (d). (b) seems to
also map to EXECMOD or EXECMEM depending on exactly how it happens,
and I'm not sure this makes all that much sense.

>
>   1. Require userspace to explicitly specificy (maximal) enclave page
>      permissions at build time.  The enclave page permissions are provided
>      to, and checked by, LSMs at enclave build time.
>
>      Pros: Low-complexity kernel implementation, straightforward auditing
>      Cons: Sullies the SGX UAPI to some extent, may increase complexity of
>            SGX2 enclave loaders.

In my notes, this works like this.  This is similar, but not
identical, to what Sean has been sending out.

EADD takes flags: ALLOW_READ, ALLOW_WRITE, ALLOW_EXEC.  It calls a new hook:

  int security_enclave_load(struct vm_area_struct *source, unsigned int flags);

(Sean passed in the secinfo protection too, but I think we agreed
that this could be omitted.)  This hook will fail if ALLOW_EXEC is
requested and the LSM doesn't consider the source VMA to be
executable.  Privileges (a) and (b) are implemented here.

Optionally, we can enforce noexec here.

The future EAUG ioctl takes the same flags, but it doesn't call
security_enclave_load().  (As Cedric noted, the actual user API for EAUG
is not settled, but I don't think it makes much difference here.)

EINIT takes a sigstruct pointer.  SGX calls a new hook:

  unsigned int security_enclave_init(struct sigstruct *sigstruct,
struct vm_area_struct *source, unsigned int flags);

This hook can return -EPERM.  Otherwise it returns 0 or a combination of
flags DENY_WX and DENY_X_IF_ALLOW_WRITE.  The driver saves this value.
These represent permissions (c) and (d).

If we want to have a permission for "execute code supplied from
outside the enclave that was not measured", we could have a flag like
HAS_UNMEASURED_ALLOW_EXEC_PAGE that the LSM could consider.

mmap() and mprotect() enforce the following rules:

 - Deny if a PROT_ flag is requested but the corresponding ALLOW_ flag
   is not set for all pages in question.

 - Deny if PROT_WRITE, PROT_EXEC, and DENY_WX are all set.

 - Deny if PROT_EXEC, ALLOW_WRITE, and DENY_X_IF_ALLOW_WRITE are all set.

mprotect() and mmap() do *not* call SGX-specific LSM hooks to ask for
permission, although they can optionally call an LSM hook if they hit one of
the -EPERM cases for auditing purposes.


I think this model works quite well in an SGX1 world.  The main thing
that makes me uneasy about this model is that, in SGX2, it requires
that an SGX2-compatible enclave loader must pre-declare to the kernel
whether it intends for its dynamically allocated memory to be
ALLOW_EXEC.  If ALLOW_EXEC is set but not actually needed, it will
still fail if DENY_X_IF_ALLOW_WRITE ends up being set.  The other
version below does not have this limitation.

>
>   2. Pre-check LSM permissions and dynamically track mappings to enclave
>      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
>      based on the pre-checked permissions.
>
>      Pros: Does not impact SGX UAPI, medium kernel complexity
>      Cons: Auditing is complex/weird, requires taking enclave-specific
>            lock during mprotect() to query/update tracking.

Here's how this looks in my mind.  It's quite similar, except that
ALLOW_READ, ALLOW_WRITE, and ALLOW_EXEC are replaced with a little
state machine.

EADD does not take any special flags.  It calls this LSM hook:

  int security_enclave_load(struct vm_area_struct *source);

This hook can return -EPERM.  Otherwise it 0 or ALLOC_EXEC_IF_UNMODIFIED
(i.e. 1).  This hook enforces permissions (a) and (b).

The driver tracks a state for each page, and the possible states are:

 - CLEAN_MAYEXEC /* no W or X VMAs have existed, but X is okay */
 - CLEAN_NOEXEC /* no W or X VMAs have existed, and X is not okay */
 - CLEAN_EXEC /* no W VMA has existed, but an X VMA has existed */
 - DIRTY /* a W VMA has existed */

The initial state for a page is CLEAN_MAYEXEC if the hook said
ALLOW_EXEC_IF_UNMODIFIED and CLEAN_NOEXEC otherwise.

The future EAUG does not call a hook at all and puts pages into the state
CLEAN_NOEXEC.  If SGX3 or later ever adds EAUG-but-don't-clear, it can
call security_enclave_load() and add CLEAN_MAYEXEC pages if appropriate.

EINIT takes a sigstruct pointer.  SGX calls a new hook:

  unsigned int security_enclave_init(struct sigstruct *sigstruct,
struct vm_area_struct *source, unsigned int flags);

This hook can return -EPERM.  Otherwise it returns 0 or a combination of
flags DENY_WX and DENY_X_DIRTY.  The driver saves this value.
These represent permissions (c) and (d).

If we want to have a permission for "execute code supplied from outside the
enclave that was not measured", we could have a flag like
HAS_UNMEASURED_CLEAN_EXEC_PAGE that the LSM could consider.

mmap() and mprotect() enforce the following rules:

 - If VM_EXEC is requested and (either the page is DIRTY or VM_WRITE is
   requested) and DENY_X_DIRTY, then deny.

 - If VM_WRITE and VM_EXEC are both requested and DENY_WX, then deny.

 - If VM_WRITE is requested, we need to update the state.  If it was
   CLEAN_EXEC, then we reject if DENY_X_DIRTY.  Otherwise we change the
   state to DIRTY.

 - If VM_EXEC is requested and the page is CLEAN_NOEXEC, then deny.

mprotect() and mmap() do *not* call SGX-specific LSM hooks to ask for
permission, although they can optionally call an LSM hook if they hit one of
the -EPERM cases for auditing purposes.

Before the SIGSTRUCT is provided to the driver, the driver acts as though
DENY_X_DIRTY and DENY_WX are both set.
Sean Christopherson June 12, 2019, 10:02 p.m. UTC | #7
On Wed, Jun 12, 2019 at 12:30:20PM -0700, Andy Lutomirski wrote:
> On Tue, Jun 11, 2019 at 3:02 PM Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
> >
> >   1. Require userspace to explicitly specificy (maximal) enclave page
> >      permissions at build time.  The enclave page permissions are provided
> >      to, and checked by, LSMs at enclave build time.
> >
> >      Pros: Low-complexity kernel implementation, straightforward auditing
> >      Cons: Sullies the SGX UAPI to some extent, may increase complexity of
> >            SGX2 enclave loaders.
> 
> In my notes, this works like this.  This is similar, but not
> identical, to what Sean has been sending out.

...

> mmap() and mprotect() enforce the following rules:
> 
>  - Deny if a PROT_ flag is requested but the corresponding ALLOW_ flag
>    is not set for all pages in question.
> 
>  - Deny if PROT_WRITE, PROT_EXEC, and DENY_WX are all set.
> 
>  - Deny if PROT_EXEC, ALLOW_WRITE, and DENY_X_IF_ALLOW_WRITE are all set.
> 
> mprotect() and mmap() do *not* call SGX-specific LSM hooks to ask for
> permission, although they can optionally call an LSM hook if they hit one of
> the -EPERM cases for auditing purposes.

IMO, #1 only makes sense if it's stripped down to avoid auditing and
locking complications, i.e. gets a pass/fail at security_enclave_load()
and clears VM_MAY* flags during mmap().  If we want WX and W->X to be
differentiated by security_enclave_init() as opposed to
security_enclave_load(), then we should just scrap #1.

> I think this model works quite well in an SGX1 world.  The main thing
> that makes me uneasy about this model is that, in SGX2, it requires
> that an SGX2-compatible enclave loader must pre-declare to the kernel
> whether it intends for its dynamically allocated memory to be
> ALLOW_EXEC.  If ALLOW_EXEC is set but not actually needed, it will
> still fail if DENY_X_IF_ALLOW_WRITE ends up being set.  The other
> version below does not have this limitation.

I'm not convinced this will be a meaningful limitation in practice, though
that's probably obvious from my RFCs :-).  That being said, the UAPI quirk
is essentially a dealbreaker for multiple people, so let's drop #1.

I discussed the options with Cedric offline, and he is ok with option #2
*if* the idea actually translates to acceptable code and doesn't present
problems for userspace and/or future SGX features.

So, I'll work on an RFC series to implement #2 as described below.  If it
works out, yay!  If not, i.e. option #2 is fundamentally broken, I'll
shift my focus to Cedric's code (option #3).

> >   2. Pre-check LSM permissions and dynamically track mappings to enclave
> >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> >      based on the pre-checked permissions.
> >
> >      Pros: Does not impact SGX UAPI, medium kernel complexity
> >      Cons: Auditing is complex/weird, requires taking enclave-specific
> >            lock during mprotect() to query/update tracking.
> 
> Here's how this looks in my mind.  It's quite similar, except that
> ALLOW_READ, ALLOW_WRITE, and ALLOW_EXEC are replaced with a little
> state machine.
> 
> EADD does not take any special flags.  It calls this LSM hook:
> 
>   int security_enclave_load(struct vm_area_struct *source);
> 
> This hook can return -EPERM.  Otherwise it 0 or ALLOC_EXEC_IF_UNMODIFIED
> (i.e. 1).  This hook enforces permissions (a) and (b).
> 
> The driver tracks a state for each page, and the possible states are:
> 
>  - CLEAN_MAYEXEC /* no W or X VMAs have existed, but X is okay */
>  - CLEAN_NOEXEC /* no W or X VMAs have existed, and X is not okay */
>  - CLEAN_EXEC /* no W VMA has existed, but an X VMA has existed */
>  - DIRTY /* a W VMA has existed */
> 
> The initial state for a page is CLEAN_MAYEXEC if the hook said
> ALLOW_EXEC_IF_UNMODIFIED and CLEAN_NOEXEC otherwise.
> 
> The future EAUG does not call a hook at all and puts pages into the state
> CLEAN_NOEXEC.  If SGX3 or later ever adds EAUG-but-don't-clear, it can
> call security_enclave_load() and add CLEAN_MAYEXEC pages if appropriate.
> 
> EINIT takes a sigstruct pointer.  SGX calls a new hook:
> 
>   unsigned int security_enclave_init(struct sigstruct *sigstruct,
> struct vm_area_struct *source, unsigned int flags);
> 
> This hook can return -EPERM.  Otherwise it returns 0 or a combination of
> flags DENY_WX and DENY_X_DIRTY.  The driver saves this value.
> These represent permissions (c) and (d).
> 
> If we want to have a permission for "execute code supplied from outside the
> enclave that was not measured", we could have a flag like
> HAS_UNMEASURED_CLEAN_EXEC_PAGE that the LSM could consider.
>
> mmap() and mprotect() enforce the following rules:
> 
>  - If VM_EXEC is requested and (either the page is DIRTY or VM_WRITE is
>    requested) and DENY_X_DIRTY, then deny.
> 
>  - If VM_WRITE and VM_EXEC are both requested and DENY_WX, then deny.
> 
>  - If VM_WRITE is requested, we need to update the state.  If it was
>    CLEAN_EXEC, then we reject if DENY_X_DIRTY.  Otherwise we change the
>    state to DIRTY.
> 
>  - If VM_EXEC is requested and the page is CLEAN_NOEXEC, then deny.
> 
> mprotect() and mmap() do *not* call SGX-specific LSM hooks to ask for
> permission, although they can optionally call an LSM hook if they hit one of
> the -EPERM cases for auditing purposes.
> 
> Before the SIGSTRUCT is provided to the driver, the driver acts as though
> DENY_X_DIRTY and DENY_WX are both set.
Xing, Cedric June 13, 2019, 12:10 a.m. UTC | #8
> From: Christopherson, Sean J
> Sent: Wednesday, June 12, 2019 3:03 PM
> 
> > I think this model works quite well in an SGX1 world.  The main thing
> > that makes me uneasy about this model is that, in SGX2, it requires
> > that an SGX2-compatible enclave loader must pre-declare to the kernel
> > whether it intends for its dynamically allocated memory to be
> > ALLOW_EXEC.  If ALLOW_EXEC is set but not actually needed, it will
> > still fail if DENY_X_IF_ALLOW_WRITE ends up being set.  The other
> > version below does not have this limitation.
> 
> I'm not convinced this will be a meaningful limitation in practice,
> though that's probably obvious from my RFCs :-).  That being said, the
> UAPI quirk is essentially a dealbreaker for multiple people, so let's
> drop #1.
> 
> I discussed the options with Cedric offline, and he is ok with option #2
> *if* the idea actually translates to acceptable code and doesn't present
> problems for userspace and/or future SGX features.
> 
> So, I'll work on an RFC series to implement #2 as described below.  If
> it works out, yay!  If not, i.e. option #2 is fundamentally broken, I'll
> shift my focus to Cedric's code (option #3).
> 
> > >   2. Pre-check LSM permissions and dynamically track mappings to
> enclave
> > >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> > >      based on the pre-checked permissions.
> > >
> > >      Pros: Does not impact SGX UAPI, medium kernel complexity
> > >      Cons: Auditing is complex/weird, requires taking enclave-
> specific
> > >            lock during mprotect() to query/update tracking.
> >
> > Here's how this looks in my mind.  It's quite similar, except that
> > ALLOW_READ, ALLOW_WRITE, and ALLOW_EXEC are replaced with a little
> > state machine.
> >
> > EADD does not take any special flags.  It calls this LSM hook:
> >
> >   int security_enclave_load(struct vm_area_struct *source);
> >
> > This hook can return -EPERM.  Otherwise it 0 or
> > ALLOC_EXEC_IF_UNMODIFIED (i.e. 1).  This hook enforces permissions (a)
> and (b).
> >
> > The driver tracks a state for each page, and the possible states are:
> >
> >  - CLEAN_MAYEXEC /* no W or X VMAs have existed, but X is okay */
> >  - CLEAN_NOEXEC /* no W or X VMAs have existed, and X is not okay */
> >  - CLEAN_EXEC /* no W VMA has existed, but an X VMA has existed */
> >  - DIRTY /* a W VMA has existed */
> >
> > The initial state for a page is CLEAN_MAYEXEC if the hook said
> > ALLOW_EXEC_IF_UNMODIFIED and CLEAN_NOEXEC otherwise.
> >
> > The future EAUG does not call a hook at all and puts pages into the
> > state CLEAN_NOEXEC.  If SGX3 or later ever adds EAUG-but-don't-clear,
> > it can call security_enclave_load() and add CLEAN_MAYEXEC pages if
> appropriate.
> >
> > EINIT takes a sigstruct pointer.  SGX calls a new hook:
> >
> >   unsigned int security_enclave_init(struct sigstruct *sigstruct,
> > struct vm_area_struct *source, unsigned int flags);
> >
> > This hook can return -EPERM.  Otherwise it returns 0 or a combination
> > of flags DENY_WX and DENY_X_DIRTY.  The driver saves this value.
> > These represent permissions (c) and (d).
> >
> > If we want to have a permission for "execute code supplied from
> > outside the enclave that was not measured", we could have a flag like
> > HAS_UNMEASURED_CLEAN_EXEC_PAGE that the LSM could consider.
> >
> > mmap() and mprotect() enforce the following rules:
> >
> >  - If VM_EXEC is requested and (either the page is DIRTY or VM_WRITE
> is
> >    requested) and DENY_X_DIRTY, then deny.
> >
> >  - If VM_WRITE and VM_EXEC are both requested and DENY_WX, then deny.
> >
> >  - If VM_WRITE is requested, we need to update the state.  If it was
> >    CLEAN_EXEC, then we reject if DENY_X_DIRTY.  Otherwise we change
> the
> >    state to DIRTY.
> >
> >  - If VM_EXEC is requested and the page is CLEAN_NOEXEC, then deny.
> >
> > mprotect() and mmap() do *not* call SGX-specific LSM hooks to ask for
> > permission, although they can optionally call an LSM hook if they hit
> > one of the -EPERM cases for auditing purposes.
> >
> > Before the SIGSTRUCT is provided to the driver, the driver acts as
> > though DENY_X_DIRTY and DENY_WX are both set.

I think we've been discussing 2 topics simultaneously, one is the state machine that accepts/rejects mmap/mprotect requests, while the other is where is the best place to put it. I think we have an agreement on the former, and IMO option #2 and #3 differ only in the latter.

Option #2 keeps the state machine inside SGX subsystem, so it could reuse existing data structures for page tracking/locking to some extent. Sean may have smarter ideas, but it looks to me like the existing 'struct sgx_encl_page' tracks individual enclave pages while the FSM states apply to ranges. So in order *not* to test page by page in mmap/mprotect, I guess some new range oriented structures are still necessary. But I don't think it very important anyway. 

My major concern is more from the architecture/modularity perspective. Specifically, the state machine is defined by LSM but SGX does the state transitions. That's a brittle relationship that'd break easily if the state machine changes in future, or if different LSM modules want to define different FSMs (comprised of different set of states and/or triggers). After all, what's needed by the SGX subsystem is just the decision, not the FSM definition. I think we should take a closer look at this area once Sean's patch comes out.
Xing, Cedric June 13, 2019, 1:02 a.m. UTC | #9
> From: Christopherson, Sean J
> Sent: Wednesday, June 12, 2019 3:03 PM
> 
> > I think this model works quite well in an SGX1 world.  The main thing
> > that makes me uneasy about this model is that, in SGX2, it requires
> > that an SGX2-compatible enclave loader must pre-declare to the kernel
> > whether it intends for its dynamically allocated memory to be
> > ALLOW_EXEC.  If ALLOW_EXEC is set but not actually needed, it will
> > still fail if DENY_X_IF_ALLOW_WRITE ends up being set.  The other
> > version below does not have this limitation.
> 
> I'm not convinced this will be a meaningful limitation in practice,
> though that's probably obvious from my RFCs :-).  That being said, the
> UAPI quirk is essentially a dealbreaker for multiple people, so let's
> drop #1.
> 
> I discussed the options with Cedric offline, and he is ok with option #2
> *if* the idea actually translates to acceptable code and doesn't present
> problems for userspace and/or future SGX features.
> 
> So, I'll work on an RFC series to implement #2 as described below.  If
> it works out, yay!  If not, i.e. option #2 is fundamentally broken, I'll
> shift my focus to Cedric's code (option #3).
> 
> > >   2. Pre-check LSM permissions and dynamically track mappings to
> enclave
> > >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> > >      based on the pre-checked permissions.
> > >
> > >      Pros: Does not impact SGX UAPI, medium kernel complexity
> > >      Cons: Auditing is complex/weird, requires taking enclave-
> specific
> > >            lock during mprotect() to query/update tracking.
> >
> > Here's how this looks in my mind.  It's quite similar, except that
> > ALLOW_READ, ALLOW_WRITE, and ALLOW_EXEC are replaced with a little
> > state machine.
> >
> > EADD does not take any special flags.  It calls this LSM hook:
> >
> >   int security_enclave_load(struct vm_area_struct *source);
> >
> > This hook can return -EPERM.  Otherwise it 0 or
> > ALLOC_EXEC_IF_UNMODIFIED (i.e. 1).  This hook enforces permissions (a)
> and (b).
> >
> > The driver tracks a state for each page, and the possible states are:
> >
> >  - CLEAN_MAYEXEC /* no W or X VMAs have existed, but X is okay */
> >  - CLEAN_NOEXEC /* no W or X VMAs have existed, and X is not okay */
> >  - CLEAN_EXEC /* no W VMA has existed, but an X VMA has existed */
> >  - DIRTY /* a W VMA has existed */
> >
> > The initial state for a page is CLEAN_MAYEXEC if the hook said
> > ALLOW_EXEC_IF_UNMODIFIED and CLEAN_NOEXEC otherwise.
> >
> > The future EAUG does not call a hook at all and puts pages into the
> > state CLEAN_NOEXEC.  If SGX3 or later ever adds EAUG-but-don't-clear,
> > it can call security_enclave_load() and add CLEAN_MAYEXEC pages if
> appropriate.
> >
> > EINIT takes a sigstruct pointer.  SGX calls a new hook:
> >
> >   unsigned int security_enclave_init(struct sigstruct *sigstruct,
> > struct vm_area_struct *source, unsigned int flags);
> >
> > This hook can return -EPERM.  Otherwise it returns 0 or a combination
> > of flags DENY_WX and DENY_X_DIRTY.  The driver saves this value.
> > These represent permissions (c) and (d).
> >
> > If we want to have a permission for "execute code supplied from
> > outside the enclave that was not measured", we could have a flag like
> > HAS_UNMEASURED_CLEAN_EXEC_PAGE that the LSM could consider.
> >
> > mmap() and mprotect() enforce the following rules:
> >
> >  - If VM_EXEC is requested and (either the page is DIRTY or VM_WRITE
> is
> >    requested) and DENY_X_DIRTY, then deny.
> >
> >  - If VM_WRITE and VM_EXEC are both requested and DENY_WX, then deny.
> >
> >  - If VM_WRITE is requested, we need to update the state.  If it was
> >    CLEAN_EXEC, then we reject if DENY_X_DIRTY.  Otherwise we change
> the
> >    state to DIRTY.
> >
> >  - If VM_EXEC is requested and the page is CLEAN_NOEXEC, then deny.
> >
> > mprotect() and mmap() do *not* call SGX-specific LSM hooks to ask for
> > permission, although they can optionally call an LSM hook if they hit
> > one of the -EPERM cases for auditing purposes.
> >
> > Before the SIGSTRUCT is provided to the driver, the driver acts as
> > though DENY_X_DIRTY and DENY_WX are both set.

I think we've been discussing 2 topics simultaneously, one is the state machine that accepts/rejects mmap/mprotect requests, while the other is where is the best place to put it. I think we have an agreement on the former, and IMO option #2 and #3 differ only in the latter.

Option #2 keeps the state machine inside SGX subsystem, so it could reuse existing data structures for page tracking/locking to some extent. Sean may have smarter ideas, but it looks to me like the existing 'struct sgx_encl_page' tracks individual enclave pages while the FSM states apply to ranges. So in order *not* to test page by page in mmap/mprotect, I guess some new range oriented structures are still necessary. But I don't think it very important anyway. 

My major concern is more from the architecture/modularity perspective. Specifically, the state machine is defined by LSM but SGX does the state transitions. That's a brittle relationship that'd break easily if the state machine changes in future, or if different LSM modules want to define different FSMs (comprised of different set of states and/or triggers). After all, what's needed by the SGX subsystem is just the decision, not the FSM definition. I think we should take a closer look at this area once Sean's patch comes out.
Dr. Greg June 13, 2019, 7:25 a.m. UTC | #10
On Wed, Jun 12, 2019 at 07:25:57AM -0700, Sean Christopherson wrote:

Good morning, we hope the week continues to go well for everyone.

> On Wed, Jun 12, 2019 at 04:32:21AM -0500, Dr. Greg wrote:
> > With SGX2 we will, by necessity, have to admit the notion that a
> > platform owner will not have any effective visibility into code that
> > is loaded and executed, since it can come in over a secured network
> > connection in an enclave security context.  This advocates for the
> > simplest approach possible to providing some type of regulation to any
> > form of WX page access.

> I believe we're all on the same page in the sense that we all want
> the "simplest approach possible", but there's a sliding scale of
> complexity between the kernel and userspace.  We can make life
> simple for userspace at the cost of additional complexity in the
> kernel, and vice versa.  The disagreement is over where to shove the
> extra complexity.

Yes, we are certainly cognizant of and sympathetic to the engineering
tensions involved.

The purpose of our e-mail was to leaven the discussion with the notion
that the most important question is how much complexity should be
shoved in either direction.  With respect to SGX as a technology, the
most important engineering metric is how much effective security is
actually being achieved.

Given an admission that enclave dynamic memory management (EDMM/SGX2)
is the goal in all of this, there are only two effective security
questions to be answered:

1.) Should a corpus of known memory with executable permissions be
copied into to an enclave.

2.) Should a corpus of executable memory with unknown content be
available to an enclave.

Given the functionality that SGX implements, both questions ultimately
devolve to whether or not a platform owner trusts an enclave author.
Security relevant trust is conveyed through cryptographically mediated
mechanisms.

The decision has been made to take full hardware mediated
cryptographic trust off the table for the mainstream Linux
implementation.  Given that, the most pragmatic engineering solution
would seem to be to implement the least complex implementation that
allows a platform owner to answer the two questions above.

See below.

> > Current state of the art, and there doesn't appear to be a reason to
> > change this, is to package an enclave in the form of an ELF shared
> > library.  It seems straight forward to inherit and act on page
> > privileges from the privileges specified on the ELF sections that are
> > loaded.  Loaders will have a file descriptor available so an mmap of
> > the incoming page with the specified privileges should trigger the
> > required LSM interventions and tie them to a specific enclave.
> > 
> > The current enclave 'standard' also uses layout metadata, stored in a
> > special .notes section of the shared image, to direct a loader with
> > respect to construction of the enclave stack, heap, TCS and other
> > miscellaneous regions not directly coded by the ELF TEXT sections.  It
> > seems straight forward to extend this paradigm to declare region(s) of
> > an enclave that are eligible to be generated at runtime (EAUG'ed) with
> > the RWX protections needed to support dynamically loaded code.
> > 
> > If an enclave wishes to support this functionality, it would seem
> > straight forward to require an enclave to provide a single zero page
> > which the loader will mmap with those protections in order to trigger
> > the desired LSM checks against that specific enclave.

> This is effectively #1, e.g. would require userspace to pre-declare its
> intent to make regions W->X.

Yes, we understood that when we wrote our original e-mail.

This model effectively allows the two relevant security questions to
be easily answered and is most consistent with current enclave
formats, software practices and runtimes.  It is also largely
consistent with existing LSM practices.

There hasn't been any discussion with respect to backports of this
driver but we believe it it safe to conclude that the industry is
going to be at least two years away from any type of realistic
deployments of this driver.  By that time there will be over a half a
decade of software deployment of existing API's and enclave formats.

Expecting a 'flag day' to be successful would seem to be contrary to
all known history of software practice and would thus disadvantage
Linux as an effective platform for this technology.

Best wishes for a productive remainder of the week to everyone.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686            EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"Sometimes I sing and dance around the house in my underwear,
 doesn't make me Madonna, never will.
                                -- Cyn
                                   Working Girl
Stephen Smalley June 13, 2019, 5:02 p.m. UTC | #11
On 6/11/19 6:02 PM, Sean Christopherson wrote:
> On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
>> I haven't looked at this code closely, but it feels like a lot of
>> SGX-specific logic embedded into SELinux that will have to be repeated or
>> reused for every security module.  Does SGX not track this state itself?
> 
> SGX does track equivalent state.
> 
> There are three proposals on the table (I think):
> 
>    1. Require userspace to explicitly specificy (maximal) enclave page
>       permissions at build time.  The enclave page permissions are provided
>       to, and checked by, LSMs at enclave build time.
> 
>       Pros: Low-complexity kernel implementation, straightforward auditing
>       Cons: Sullies the SGX UAPI to some extent, may increase complexity of
>             SGX2 enclave loaders.
> 
>    2. Pre-check LSM permissions and dynamically track mappings to enclave
>       pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
>       based on the pre-checked permissions.
> 
>       Pros: Does not impact SGX UAPI, medium kernel complexity
>       Cons: Auditing is complex/weird, requires taking enclave-specific
>             lock during mprotect() to query/update tracking.
> 
>    3. Implement LSM hooks in SGX to allow LSMs to track enclave regions
>       from cradle to grave, but otherwise defer everything to LSMs.
> 
>       Pros: Does not impact SGX UAPI, maximum flexibility, precise auditing
>       Cons: Most complex and "heaviest" kernel implementation of the three,
>             pushes more SGX details into LSMs.
> 
> My RFC series[1] implements #1.  My understanding is that Andy (Lutomirski)
> prefers #2.  Cedric's RFC series implements #3.
> 
> Perhaps the easiest way to make forward progress is to rule out the
> options we absolutely *don't* want by focusing on the potentially blocking
> issue with each option:
> 
>    #1 - SGX UAPI funkiness
> 
>    #2 - Auditing complexity, potential enclave lock contention
> 
>    #3 - Pushing SGX details into LSMs and complexity of kernel implementation
> 
> 
> [1] https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherson@intel.com

Given the complexity tradeoff, what is the clear motivating example for 
why #1 isn't the obvious choice? That the enclave loader has no way of 
knowing a priori whether the enclave will require W->X or WX?  But 
aren't we better off requiring enclaves to be explicitly marked as 
needing such so that we can make a more informed decision about whether 
to load them in the first place?
Stephen Smalley June 13, 2019, 6 p.m. UTC | #12
On 6/11/19 6:55 PM, Xing, Cedric wrote:
>> From: linux-sgx-owner@vger.kernel.org [mailto:linux-sgx-
>> owner@vger.kernel.org] On Behalf Of Stephen Smalley
>> Sent: Tuesday, June 11, 2019 6:40 AM
>>
>>>
>>> +#ifdef CONFIG_INTEL_SGX
>>> +	rc = sgxsec_mprotect(vma, prot);
>>> +	if (rc <= 0)
>>> +		return rc;
>>
>> Why are you skipping the file_map_prot_check() call when rc == 0?
>> What would SELinux check if you didn't do so -
>> FILE__READ|FILE__WRITE|FILE__EXECUTE to /dev/sgx/enclave?  Is it a
>> problem to let SELinux proceed with that check?
> 
> We can continue the check. But in practice, all FILE__{READ|WRITE|EXECUTE} are needed for every enclave, then what's the point of checking them? FILE__EXECMOD may be the only flag that has a meaning, but it's kind of redundant because sigstruct file was checked against that already.

I don't believe FILE__EXECMOD will be checked since it is a shared file 
mapping.  We'll check at least FILE__READ and FILE__WRITE anyway upon 
open(), and possibly FILE__EXECUTE upon mmap() unless that is never 
PROT_EXEC.  We want the policy to accurately reflect the operations of 
the system, even when an operation "must" be allowed, and even here this 
only needs to be allowed to processes authorized as enclave loaders, not 
to all processes.

I don't think there are other examples where we skip a SELinux check 
like this.  If we were to do so here, we would at least need a comment 
explaining that it was intentional and why.  The risk would be that 
future checking added into file_map_prot_check() would be unwittingly 
bypassed for these mappings.  A warning there would also be advisable if 
we skip it for these mappings.

> 
>>> +static int selinux_enclave_load(struct file *encl, unsigned long addr,
>>> +				unsigned long size, unsigned long prot,
>>> +				struct vm_area_struct *source)
>>> +{
>>> +	if (source) {
>>> +		/**
>>> +		 * Adding page from source => EADD request
>>> +		 */
>>> +		int rc = selinux_file_mprotect(source, prot, prot);
>>> +		if (rc)
>>> +			return rc;
>>> +
>>> +		if (!(prot & VM_EXEC) &&
>>> +		    selinux_file_mprotect(source, VM_EXEC, VM_EXEC))
>>
>> I wouldn't conflate VM_EXEC with PROT_EXEC even if they happen to be
>> defined with the same values currently.  Elsewhere the kernel appears to
>> explicitly translate them ala calc_vm_prot_bits().
> 
> Thanks! I'd change them to PROT_EXEC in the next version.
> 
>>
>> Also, this will mean that we will always perform an execute check on all
>> sources, thereby triggering audit denial messages for any EADD sources
>> that are only intended to be data.  Depending on the source, this could
>> trigger PROCESS__EXECMEM or FILE__EXECMOD or FILE__EXECUTE.  In a world
>> where users often just run any denials they see through audit2allow,
>> they'll end up always allowing them all.  How can they tell whether it
>> was needed? It would be preferable if we could only trigger execute
>> checks when there is some probability that execute will be requested in
>> the future.  Alternatives would be to silence the audit of these
>> permission checks always via use of _noaudit() interfaces or to silence
>> audit of these permissions via dontaudit rules in policy, but the latter
>> would hide all denials of the permission by the process, not just those
>> triggered from security_enclave_load().  And if we silence them, then we
>> won't see them even if they were needed.
> 
> *_noaudit() is exactly what I wanted. But I couldn't find selinux_file_mprotect_noaudit()/file_has_perm_noaudit(), and I'm reluctant to duplicate code. Any suggestions?

I would have no objection to adding _noaudit() variants of these, either 
duplicating code (if sufficiently small/simple) or creating a common 
helper with a bool audit flag that gets used for both. But the larger 
issue would be to resolve how to ultimately ensure that a denial is 
audited later if the denied permission is actually requested and blocked 
via sgxsec_mprotect().

>   
>>
>>> +			prot = 0;
>>> +		else {
>>> +			prot = SGX__EXECUTE;
>>> +			if (source->vm_file &&
>>> +			    !file_has_perm(current_cred(), source->vm_file,
>>> +					   FILE__EXECMOD))
>>> +				prot |= SGX__EXECMOD;
>>
>> Similarly, this means that we will always perform a FILE__EXECMOD check
>> on all executable sources, triggering audit denial messages for any EADD
>> source that is executable but to which EXECMOD is not allowed, and again
>> the most common pattern will be that users will add EXECMOD to all
>> executable sources to avoid this.
>>
>>> +		}
>>> +		return sgxsec_eadd(encl, addr, size, prot);
>>> +	} else {
>>> +		/**
>>> +		  * Adding page from NULL => EAUG request
>>> +		  */
>>> +		return sgxsec_eaug(encl, addr, size, prot);
>>> +	}
>>> +}
>>> +
>>> +static int selinux_enclave_init(struct file *encl,
>>> +				const struct sgx_sigstruct *sigstruct,
>>> +				struct vm_area_struct *vma)
>>> +{
>>> +	int rc = 0;
>>> +
>>> +	if (!vma)
>>> +		rc = -EINVAL;
>>
>> Is it ever valid to call this hook with a NULL vma?  If not, this should
>> be handled/prevented by the caller.  If so, I'd just return -EINVAL
>> immediately here.
> 
> vma shall never be NULL. I'll update it in the next version.
> 
>>
>>> +
>>> +	if (!rc && !(vma->vm_flags & VM_EXEC))
>>> +		rc = selinux_file_mprotect(vma, VM_EXEC, VM_EXEC);
>>
>> I had thought we were trying to avoid overloading FILE__EXECUTE (or
>> whatever gets checked here, e.g. could be PROCESS__EXECMEM or
>> FILE__EXECMOD) on the sigstruct file, since the caller isn't truly
>> executing code from it.
> 
> Agreed. Another problem with FILE__EXECMOD on the sigstruct file is that user code would then be allowed to modify SIGSTRUCT at will, which effectively wipes out the protection provided by FILE__EXECUTE.
> 
>>
>> I'd define new ENCLAVE__* permissions, including an up-front
>> ENCLAVE__INIT permission that governs whether the sigstruct file can be
>> used at all irrespective of memory protections.
> 
> Agreed.
> 
>>
>> Then you can also have ENCLAVE__EXECUTE, ENCLAVE__EXECMEM,
>> ENCLAVE__EXECMOD for the execute-related checks.  Or you can use the
>> /dev/sgx/enclave inode as the target for the execute checks and just
>> reuse the file permissions there.
> 
> Now we've got 2 options - 1) New ENCLAVE__* flags on sigstruct file or 2) FILE__* on /dev/sgx/enclave. Which one do you think makes more sense?
> 
> ENCLAVE__EXECMEM seems to offer finer granularity (than PROCESS__EXECMEM) but I wonder if it'd have any real use in practice.

Defining a separate ENCLAVE__EXECUTE and using it here for the sigstruct 
file would avoid any ambiguity with the FILE__EXECUTE check to the 
/dev/sgx/enclave inode that might occur upon mmap() or mprotect().  A 
separate ENCLAVE__EXECMEM would enable allowing WX within the enclave 
while denying it in the host application or vice versa, which could be a 
good thing for security, particularly if SGX2 largely ends up always 
wanting WX.

> 
>>> +int sgxsec_mprotect(struct vm_area_struct *vma, size_t prot) {
>>> +	struct enclave_sec *esec;
>>> +	int rc;
>>> +
>>> +	if (!vma->vm_file || !(esec = __esec(selinux_file(vma->vm_file))))
>> {
>>> +		/* Positive return value indicates non-enclave VMA */
>>> +		return 1;
>>> +	}
>>> +
>>> +	down_read(&esec->sem);
>>> +	rc = enclave_mprotect(&esec->regions, vma->vm_start, vma->vm_end,
>>> +prot);
>>
>> Why is it safe for this to only use down_read()? enclave_mprotect() can
>> call enclave_prot_set_cb() which modifies the list?
> 
> Probably because it was too late at night when I wrote this line:-( Good catch!
> 
>>
>> I haven't looked at this code closely, but it feels like a lot of SGX-
>> specific logic embedded into SELinux that will have to be repeated or
>> reused for every security module.  Does SGX not track this state itself?
> 
> I can tell you have looked quite closely, and I truly think you for your time!
> 
> You are right that there are SGX specific stuff. More precisely, SGX enclaves don't have access to anything except memory, so there are only 3 questions that need to be answered for each enclave page: 1) whether X is allowed; 2) whether W->X is allowed and 3 whether WX is allowed. This proposal tries to cache the answers to those questions upon creation of each enclave page, meaning it involves a) figuring out the answers and b) "remember" them for every page. #b is generic, mostly captured in intel_sgx.c, and could be shared among all LSM modules; while #a is SELinux specific. I could move intel_sgx.c up one level in the directory hierarchy if that's what you'd suggest.
> 
> By "SGX", did you mean the SGX subsystem being upstreamed? It doesn’t track that state. In practice, there's no way for SGX to track it because there's no vm_ops->may_mprotect() callback. It doesn't follow the philosophy of Linux either, as mprotect() doesn't track it for regular memory. And it doesn't have a use without LSM, so I believe it makes more sense to track it inside LSM.

Yes, the SGX driver/subsystem.  I had the impression from Sean that it 
does track this kind of per-page state already in some manner, but 
possibly he means it does under a given proposal and not in the current 
driver.

Even the #b remembering might end up being SELinux-specific if we also 
have to remember the original inputs used to compute the answer so that 
we can audit that information when access is denied later upon 
mprotect().  At the least we'd need it to save some opaque data and pass 
it to a callback into SELinux to perform that auditing.
Sean Christopherson June 13, 2019, 7:48 p.m. UTC | #13
On Thu, Jun 13, 2019 at 02:00:29PM -0400, Stephen Smalley wrote:
> On 6/11/19 6:55 PM, Xing, Cedric wrote:
> >You are right that there are SGX specific stuff. More precisely, SGX
> >enclaves don't have access to anything except memory, so there are only 3
> >questions that need to be answered for each enclave page: 1) whether X is
> >allowed; 2) whether W->X is allowed and 3 whether WX is allowed. This
> >proposal tries to cache the answers to those questions upon creation of each
> >enclave page, meaning it involves a) figuring out the answers and b)
> >"remember" them for every page. #b is generic, mostly captured in
> >intel_sgx.c, and could be shared among all LSM modules; while #a is SELinux
> >specific. I could move intel_sgx.c up one level in the directory hierarchy
> >if that's what you'd suggest.
> >
> >By "SGX", did you mean the SGX subsystem being upstreamed? It doesn’t track
> >that state. In practice, there's no way for SGX to track it because there's
> >no vm_ops->may_mprotect() callback. It doesn't follow the philosophy of
> >Linux either, as mprotect() doesn't track it for regular memory. And it
> >doesn't have a use without LSM, so I believe it makes more sense to track it
> >inside LSM.
> 
> Yes, the SGX driver/subsystem.  I had the impression from Sean that it does
> track this kind of per-page state already in some manner, but possibly he
> means it does under a given proposal and not in the current driver.

Yeah, under a given proposal.  SGX has per-page tracking, adding new flags
is fairly easy.  Philosophical objections aside, adding .may_mprotect() is
trivial.

> Even the #b remembering might end up being SELinux-specific if we also have
> to remember the original inputs used to compute the answer so that we can
> audit that information when access is denied later upon mprotect().  At the
> least we'd need it to save some opaque data and pass it to a callback into
> SELinux to perform that auditing.
Xing, Cedric June 13, 2019, 9:02 p.m. UTC | #14
> From: linux-sgx-owner@vger.kernel.org [mailto:linux-sgx-
> owner@vger.kernel.org] On Behalf Of Stephen Smalley
> 
> On 6/11/19 6:55 PM, Xing, Cedric wrote:
> >> From: linux-sgx-owner@vger.kernel.org [mailto:linux-sgx-
> >> owner@vger.kernel.org] On Behalf Of Stephen Smalley
> >> Sent: Tuesday, June 11, 2019 6:40 AM
> >>
> >>>
> >>> +#ifdef CONFIG_INTEL_SGX
> >>> +	rc = sgxsec_mprotect(vma, prot);
> >>> +	if (rc <= 0)
> >>> +		return rc;
> >>
> >> Why are you skipping the file_map_prot_check() call when rc == 0?
> >> What would SELinux check if you didn't do so -
> >> FILE__READ|FILE__WRITE|FILE__EXECUTE to /dev/sgx/enclave?  Is it a
> >> problem to let SELinux proceed with that check?
> >
> > We can continue the check. But in practice, all
> FILE__{READ|WRITE|EXECUTE} are needed for every enclave, then what's the
> point of checking them? FILE__EXECMOD may be the only flag that has a
> meaning, but it's kind of redundant because sigstruct file was checked
> against that already.
> 
> I don't believe FILE__EXECMOD will be checked since it is a shared file
> mapping.  We'll check at least FILE__READ and FILE__WRITE anyway upon
> open(), and possibly FILE__EXECUTE upon mmap() unless that is never
> PROT_EXEC.  We want the policy to accurately reflect the operations of
> the system, even when an operation "must" be allowed, and even here this
> only needs to be allowed to processes authorized as enclave loaders, not
> to all processes.
> 
> I don't think there are other examples where we skip a SELinux check
> like this.  If we were to do so here, we would at least need a comment
> explaining that it was intentional and why.  The risk would be that
> future checking added into file_map_prot_check() would be unwittingly
> bypassed for these mappings.  A warning there would also be advisable if
> we skip it for these mappings.

You are right! The code was written assuming file_map_prot_check() wouldn't object if sgxsec_mprotect() approves it, but that may not always be the case if new checks are added in future. I'll add the check back.
 
> 
> >
> >>> +static int selinux_enclave_load(struct file *encl, unsigned long
> addr,
> >>> +				unsigned long size, unsigned long prot,
> >>> +				struct vm_area_struct *source)
> >>> +{
> >>> +	if (source) {
> >>> +		/**
> >>> +		 * Adding page from source => EADD request
> >>> +		 */
> >>> +		int rc = selinux_file_mprotect(source, prot, prot);
> >>> +		if (rc)
> >>> +			return rc;
> >>> +
> >>> +		if (!(prot & VM_EXEC) &&
> >>> +		    selinux_file_mprotect(source, VM_EXEC, VM_EXEC))
> >>
> >> I wouldn't conflate VM_EXEC with PROT_EXEC even if they happen to be
> >> defined with the same values currently.  Elsewhere the kernel appears
> >> to explicitly translate them ala calc_vm_prot_bits().
> >
> > Thanks! I'd change them to PROT_EXEC in the next version.
> >
> >>
> >> Also, this will mean that we will always perform an execute check on
> >> all sources, thereby triggering audit denial messages for any EADD
> >> sources that are only intended to be data.  Depending on the source,
> >> this could trigger PROCESS__EXECMEM or FILE__EXECMOD or
> >> FILE__EXECUTE.  In a world where users often just run any denials
> >> they see through audit2allow, they'll end up always allowing them
> >> all.  How can they tell whether it was needed? It would be preferable
> >> if we could only trigger execute checks when there is some
> >> probability that execute will be requested in the future.
> >> Alternatives would be to silence the audit of these permission checks
> >> always via use of _noaudit() interfaces or to silence audit of these
> >> permissions via dontaudit rules in policy, but the latter would hide
> >> all denials of the permission by the process, not just those
> >> triggered from security_enclave_load().  And if we silence them, then
> we won't see them even if they were needed.
> >
> > *_noaudit() is exactly what I wanted. But I couldn't find
> selinux_file_mprotect_noaudit()/file_has_perm_noaudit(), and I'm
> reluctant to duplicate code. Any suggestions?
> 
> I would have no objection to adding _noaudit() variants of these, either
> duplicating code (if sufficiently small/simple) or creating a common
> helper with a bool audit flag that gets used for both. But the larger
> issue would be to resolve how to ultimately ensure that a denial is
> audited later if the denied permission is actually requested and blocked
> via sgxsec_mprotect().

The idea here is to precompute the answers as if a certain request were received, so that we don't have to store all inputs to the precomputation. sgxsec_mprotect(), if coded correctly, would make the same decision regardless it was precomputed or computed at the time of the real request. Auditing requires more information than making the decision itself, such as the file path and when the request was made. I'm reluctant to keep the source files open just for audit logs. I'll need a closer look at the auditing code to figure out an appropriate way.

> 
> >
> >>
> >>> +			prot = 0;
> >>> +		else {
> >>> +			prot = SGX__EXECUTE;
> >>> +			if (source->vm_file &&
> >>> +			    !file_has_perm(current_cred(), source->vm_file,
> >>> +					   FILE__EXECMOD))
> >>> +				prot |= SGX__EXECMOD;
> >>
> >> Similarly, this means that we will always perform a FILE__EXECMOD
> check
> >> on all executable sources, triggering audit denial messages for any
> EADD
> >> source that is executable but to which EXECMOD is not allowed, and
> again
> >> the most common pattern will be that users will add EXECMOD to all
> >> executable sources to avoid this.
> >>
> >>> +		}
> >>> +		return sgxsec_eadd(encl, addr, size, prot);
> >>> +	} else {
> >>> +		/**
> >>> +		  * Adding page from NULL => EAUG request
> >>> +		  */
> >>> +		return sgxsec_eaug(encl, addr, size, prot);
> >>> +	}
> >>> +}
> >>> +
> >>> +static int selinux_enclave_init(struct file *encl,
> >>> +				const struct sgx_sigstruct *sigstruct,
> >>> +				struct vm_area_struct *vma)
> >>> +{
> >>> +	int rc = 0;
> >>> +
> >>> +	if (!vma)
> >>> +		rc = -EINVAL;
> >>
> >> Is it ever valid to call this hook with a NULL vma?  If not, this
> should
> >> be handled/prevented by the caller.  If so, I'd just return -EINVAL
> >> immediately here.
> >
> > vma shall never be NULL. I'll update it in the next version.
> >
> >>
> >>> +
> >>> +	if (!rc && !(vma->vm_flags & VM_EXEC))
> >>> +		rc = selinux_file_mprotect(vma, VM_EXEC, VM_EXEC);
> >>
> >> I had thought we were trying to avoid overloading FILE__EXECUTE (or
> >> whatever gets checked here, e.g. could be PROCESS__EXECMEM or
> >> FILE__EXECMOD) on the sigstruct file, since the caller isn't truly
> >> executing code from it.
> >
> > Agreed. Another problem with FILE__EXECMOD on the sigstruct file is
> that user code would then be allowed to modify SIGSTRUCT at will, which
> effectively wipes out the protection provided by FILE__EXECUTE.
> >
> >>
> >> I'd define new ENCLAVE__* permissions, including an up-front
> >> ENCLAVE__INIT permission that governs whether the sigstruct file can
> be
> >> used at all irrespective of memory protections.
> >
> > Agreed.
> >
> >>
> >> Then you can also have ENCLAVE__EXECUTE, ENCLAVE__EXECMEM,
> >> ENCLAVE__EXECMOD for the execute-related checks.  Or you can use the
> >> /dev/sgx/enclave inode as the target for the execute checks and just
> >> reuse the file permissions there.
> >
> > Now we've got 2 options - 1) New ENCLAVE__* flags on sigstruct file or
> 2) FILE__* on /dev/sgx/enclave. Which one do you think makes more sense?
> >
> > ENCLAVE__EXECMEM seems to offer finer granularity (than
> PROCESS__EXECMEM) but I wonder if it'd have any real use in practice.
> 
> Defining a separate ENCLAVE__EXECUTE and using it here for the sigstruct
> file would avoid any ambiguity with the FILE__EXECUTE check to the
> /dev/sgx/enclave inode that might occur upon mmap() or mprotect().  A
> separate ENCLAVE__EXECMEM would enable allowing WX within the enclave
> while denying it in the host application or vice versa, which could be a
> good thing for security, particularly if SGX2 largely ends up always
> wanting WX.

Agreed. I'll include those new flags in my next version.

> 
> >
> >>> +int sgxsec_mprotect(struct vm_area_struct *vma, size_t prot) {
> >>> +	struct enclave_sec *esec;
> >>> +	int rc;
> >>> +
> >>> +	if (!vma->vm_file || !(esec = __esec(selinux_file(vma->vm_file))))
> >> {
> >>> +		/* Positive return value indicates non-enclave VMA */
> >>> +		return 1;
> >>> +	}
> >>> +
> >>> +	down_read(&esec->sem);
> >>> +	rc = enclave_mprotect(&esec->regions, vma->vm_start, vma->vm_end,
> >>> +prot);
> >>
> >> Why is it safe for this to only use down_read()? enclave_mprotect()
> can
> >> call enclave_prot_set_cb() which modifies the list?
> >
> > Probably because it was too late at night when I wrote this line:-
> ( Good catch!
> >
> >>
> >> I haven't looked at this code closely, but it feels like a lot of
> SGX-
> >> specific logic embedded into SELinux that will have to be repeated or
> >> reused for every security module.  Does SGX not track this state
> itself?
> >
> > I can tell you have looked quite closely, and I truly think you for
> your time!
> >
> > You are right that there are SGX specific stuff. More precisely, SGX
> enclaves don't have access to anything except memory, so there are only
> 3 questions that need to be answered for each enclave page: 1) whether X
> is allowed; 2) whether W->X is allowed and 3 whether WX is allowed. This
> proposal tries to cache the answers to those questions upon creation of
> each enclave page, meaning it involves a) figuring out the answers and b)
> "remember" them for every page. #b is generic, mostly captured in
> intel_sgx.c, and could be shared among all LSM modules; while #a is
> SELinux specific. I could move intel_sgx.c up one level in the directory
> hierarchy if that's what you'd suggest.
> >
> > By "SGX", did you mean the SGX subsystem being upstreamed? It doesn’t
> track that state. In practice, there's no way for SGX to track it
> because there's no vm_ops->may_mprotect() callback. It doesn't follow
> the philosophy of Linux either, as mprotect() doesn't track it for
> regular memory. And it doesn't have a use without LSM, so I believe it
> makes more sense to track it inside LSM.
> 
> Yes, the SGX driver/subsystem.  I had the impression from Sean that it
> does track this kind of per-page state already in some manner, but
> possibly he means it does under a given proposal and not in the current
> driver.

Yes, SGX subsystem does track per-page states. But this page protection flags apply to *ranges*. 

In practice, those per-page states are *not* checked at mmap/mprotect. They are used mainly by vm_ops->fault() and the page swapper thread.

That said, merging protection flags into per-page states will require page-by-page checks, which will definitely hurt performance. Unless the driver also maintains some range oriented structures just like what you see here.

> 
> Even the #b remembering might end up being SELinux-specific if we also
> have to remember the original inputs used to compute the answer so that
> we can audit that information when access is denied later upon
> mprotect().  At the least we'd need it to save some opaque data and pass
> it to a callback into SELinux to perform that auditing.

Agreed. What's commonly needed here is a data structure that supports setting/querying value on ranges. It's close to what xarray supports, but xarray doesn't support range querying.
Xing, Cedric June 13, 2019, 9:09 p.m. UTC | #15
> From: Christopherson, Sean J
> Sent: Thursday, June 13, 2019 12:49 PM
> 
> > >By "SGX", did you mean the SGX subsystem being upstreamed? It doesn’t
> > >track that state. In practice, there's no way for SGX to track it
> > >because there's no vm_ops->may_mprotect() callback. It doesn't follow
> > >the philosophy of Linux either, as mprotect() doesn't track it for
> > >regular memory. And it doesn't have a use without LSM, so I believe
> > >it makes more sense to track it inside LSM.
> >
> > Yes, the SGX driver/subsystem.  I had the impression from Sean that it
> > does track this kind of per-page state already in some manner, but
> > possibly he means it does under a given proposal and not in the
> current driver.
> 
> Yeah, under a given proposal.  SGX has per-page tracking, adding new
> flags is fairly easy.  Philosophical objections aside,
> adding .may_mprotect() is trivial.

As I pointed out in an earlier email, protection flags are associated with ranges. They could of course be duplicated to every page but that will hurt performance because every page within the range would have to be tested individually.

Furthermore, though .may_protect()is able to make the decision, I don't think it can do the audit log as well, unless it is coded in an SELinux specific way. Then I wonder how it could work with LSM modules other than SELinux.

> 
> > Even the #b remembering might end up being SELinux-specific if we also
> > have to remember the original inputs used to compute the answer so
> > that we can audit that information when access is denied later upon
> > mprotect().  At the least we'd need it to save some opaque data and
> > pass it to a callback into SELinux to perform that auditing.
Xing, Cedric June 13, 2019, 11:03 p.m. UTC | #16
> From: Stephen Smalley [mailto:sds@tycho.nsa.gov]
> Sent: Thursday, June 13, 2019 10:02 AM
> 
> On 6/11/19 6:02 PM, Sean Christopherson wrote:
> > On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> >> I haven't looked at this code closely, but it feels like a lot of
> >> SGX-specific logic embedded into SELinux that will have to be
> >> repeated or reused for every security module.  Does SGX not track
> this state itself?
> >
> > SGX does track equivalent state.
> >
> > There are three proposals on the table (I think):
> >
> >    1. Require userspace to explicitly specificy (maximal) enclave page
> >       permissions at build time.  The enclave page permissions are
> provided
> >       to, and checked by, LSMs at enclave build time.
> >
> >       Pros: Low-complexity kernel implementation, straightforward
> auditing
> >       Cons: Sullies the SGX UAPI to some extent, may increase
> complexity of
> >             SGX2 enclave loaders.
> >
> >    2. Pre-check LSM permissions and dynamically track mappings to
> enclave
> >       pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> >       based on the pre-checked permissions.
> >
> >       Pros: Does not impact SGX UAPI, medium kernel complexity
> >       Cons: Auditing is complex/weird, requires taking enclave-
> specific
> >             lock during mprotect() to query/update tracking.
> >
> >    3. Implement LSM hooks in SGX to allow LSMs to track enclave
> regions
> >       from cradle to grave, but otherwise defer everything to LSMs.
> >
> >       Pros: Does not impact SGX UAPI, maximum flexibility, precise
> auditing
> >       Cons: Most complex and "heaviest" kernel implementation of the
> three,
> >             pushes more SGX details into LSMs.
> >
> > My RFC series[1] implements #1.  My understanding is that Andy
> > (Lutomirski) prefers #2.  Cedric's RFC series implements #3.
> >
> > Perhaps the easiest way to make forward progress is to rule out the
> > options we absolutely *don't* want by focusing on the potentially
> > blocking issue with each option:
> >
> >    #1 - SGX UAPI funkiness
> >
> >    #2 - Auditing complexity, potential enclave lock contention
> >
> >    #3 - Pushing SGX details into LSMs and complexity of kernel
> > implementation
> >
> >
> > [1]
> > https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherson
> > @intel.com
> 
> Given the complexity tradeoff, what is the clear motivating example for
> why #1 isn't the obvious choice? That the enclave loader has no way of
> knowing a priori whether the enclave will require W->X or WX?  But
> aren't we better off requiring enclaves to be explicitly marked as
> needing such so that we can make a more informed decision about whether
> to load them in the first place?

Are you asking this question at a) page granularity, b) file granularity or c) enclave (potentially comprised of multiple executable files) granularity?

#b is what we have on regular executable files and shared objects (i.e. FILE__EXECMOD). We all know how to do that.

#c is kind of new but could be done via some proxy file (e.g. sigstruct file) hence reduced to #b.

#a is problematic. It'd require compilers/linkers to generate such information, and proper executable image file format to carry that information, to be eventually picked up the loader. SELinux doesn't have PAGE__EXECMOD I guess is because it is generally considered impractical.

Option #1 however requires #a because the driver doesn't track which page was loaded from which file, otherwise it can no longer be qualified "simple". Or we could just implement #c, which will make all options simpler. But I guess #b is still preferred, to be aligned with what SELinux is enforcing today on regular memory pages.
Sean Christopherson June 13, 2019, 11:17 p.m. UTC | #17
On Thu, Jun 13, 2019 at 04:03:24PM -0700, Xing, Cedric wrote:
> > From: Stephen Smalley [mailto:sds@tycho.nsa.gov]
> > Sent: Thursday, June 13, 2019 10:02 AM
> > 
> > > My RFC series[1] implements #1.  My understanding is that Andy
> > > (Lutomirski) prefers #2.  Cedric's RFC series implements #3.
> > >
> > > Perhaps the easiest way to make forward progress is to rule out the
> > > options we absolutely *don't* want by focusing on the potentially
> > > blocking issue with each option:
> > >
> > >    #1 - SGX UAPI funkiness
> > >
> > >    #2 - Auditing complexity, potential enclave lock contention
> > >
> > >    #3 - Pushing SGX details into LSMs and complexity of kernel
> > > implementation
> > >
> > >
> > > [1]
> > > https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherson
> > > @intel.com
> > 
> > Given the complexity tradeoff, what is the clear motivating example for
> > why #1 isn't the obvious choice? That the enclave loader has no way of
> > knowing a priori whether the enclave will require W->X or WX?  But
> > aren't we better off requiring enclaves to be explicitly marked as
> > needing such so that we can make a more informed decision about whether
> > to load them in the first place?
> 
> Are you asking this question at a) page granularity, b) file granularity or
> c) enclave (potentially comprised of multiple executable files) granularity?
> 
> #b is what we have on regular executable files and shared objects (i.e.
> FILE__EXECMOD). We all know how to do that.
> 
> #c is kind of new but could be done via some proxy file (e.g. sigstruct file)
> hence reduced to #b.
> 
> #a is problematic. It'd require compilers/linkers to generate such
> information, and proper executable image file format to carry that
> information, to be eventually picked up the loader. SELinux doesn't have
> PAGE__EXECMOD I guess is because it is generally considered impractical.
> 
> Option #1 however requires #a because the driver doesn't track which page was
> loaded from which file, otherwise it can no longer be qualified "simple". Or
> we could just implement #c, which will make all options simpler. But I guess
> #b is still preferred, to be aligned with what SELinux is enforcing today on
> regular memory pages.o

Option #1 doesn't require (a).  The checks will happen for every page,
but in the RFCs I sent, the policies are still attached to files and
processes, i.e. (b).
Xing, Cedric June 14, 2019, 12:31 a.m. UTC | #18
> From: Christopherson, Sean J
> Sent: Thursday, June 13, 2019 4:18 PM
> 
> On Thu, Jun 13, 2019 at 04:03:24PM -0700, Xing, Cedric wrote:
> > > From: Stephen Smalley [mailto:sds@tycho.nsa.gov]
> > > Sent: Thursday, June 13, 2019 10:02 AM
> > >
> > > > My RFC series[1] implements #1.  My understanding is that Andy
> > > > (Lutomirski) prefers #2.  Cedric's RFC series implements #3.
> > > >
> > > > Perhaps the easiest way to make forward progress is to rule out
> the
> > > > options we absolutely *don't* want by focusing on the potentially
> > > > blocking issue with each option:
> > > >
> > > >    #1 - SGX UAPI funkiness
> > > >
> > > >    #2 - Auditing complexity, potential enclave lock contention
> > > >
> > > >    #3 - Pushing SGX details into LSMs and complexity of kernel
> > > > implementation
> > > >
> > > >
> > > > [1]
> > > > https://lkml.kernel.org/r/20190606021145.12604-1-
> sean.j.christopherson
> > > > @intel.com
> > >
> > > Given the complexity tradeoff, what is the clear motivating example
> for
> > > why #1 isn't the obvious choice? That the enclave loader has no way
> of
> > > knowing a priori whether the enclave will require W->X or WX?  But
> > > aren't we better off requiring enclaves to be explicitly marked as
> > > needing such so that we can make a more informed decision about
> whether
> > > to load them in the first place?
> >
> > Are you asking this question at a) page granularity, b) file
> granularity or
> > c) enclave (potentially comprised of multiple executable files)
> granularity?
> >
> > #b is what we have on regular executable files and shared objects (i.e.
> > FILE__EXECMOD). We all know how to do that.
> >
> > #c is kind of new but could be done via some proxy file (e.g.
> sigstruct file)
> > hence reduced to #b.
> >
> > #a is problematic. It'd require compilers/linkers to generate such
> > information, and proper executable image file format to carry that
> > information, to be eventually picked up the loader. SELinux doesn't
> have
> > PAGE__EXECMOD I guess is because it is generally considered
> impractical.
> >
> > Option #1 however requires #a because the driver doesn't track which
> page was
> > loaded from which file, otherwise it can no longer be qualified
> "simple". Or
> > we could just implement #c, which will make all options simpler. But I
> guess
> > #b is still preferred, to be aligned with what SELinux is enforcing
> today on
> > regular memory pages.o
> 
> Option #1 doesn't require (a).  The checks will happen for every page,
> but in the RFCs I sent, the policies are still attached to files and
> processes, i.e. (b).

I was talking at the UAPI level - i.e. your ioctl requires ALLOW_* at page granularity, hence #a.
Sean Christopherson June 14, 2019, 12:37 a.m. UTC | #19
On Thu, Jun 13, 2019 at 02:00:29PM -0400, Stephen Smalley wrote:
> On 6/11/19 6:55 PM, Xing, Cedric wrote:
> >*_noaudit() is exactly what I wanted. But I couldn't find
> >selinux_file_mprotect_noaudit()/file_has_perm_noaudit(), and I'm reluctant
> >to duplicate code. Any suggestions?
> 
> I would have no objection to adding _noaudit() variants of these, either
> duplicating code (if sufficiently small/simple) or creating a common helper
> with a bool audit flag that gets used for both. But the larger issue would
> be to resolve how to ultimately ensure that a denial is audited later if the
> denied permission is actually requested and blocked via sgxsec_mprotect().

I too would like to see a solution to the auditing issue.  I wrongly
assumed Cedric's approach (option #3) didn't suffer the same auditing
problem as Andy's dynamic tracking proposal (option #2).  After reading
through the code more carefully (trying to steal ideas to finish off an
implementation of #2), I've come to realize options #2 (Andy) and #3
(Cedric) are basically identical concepts, the only difference being who
tracks state.

We can use the f_security blob sizes to identify which LSM denied
something, but I haven't the faintest idea how to track the auditing
information in a sane fashion.  We'd basically have to do a deep copy on
struct common_audit_data, or pre-generate and store the audit message.
For SELinux, a deep copy is somewhat feasible because selinux_audit_data
distills everything down to basic types.  AppArmor on the other hand has
'struct aa_label *label', which at a glance all but requires pre-generating
the audit message, and since AppArmor logs denials from every profile, it's 
possible the "sgx audit blob" will consume a non-trivial amount of data.

Even if we figure out a way to store the audit messages without exploding
memory consumption or making things horrendously complex, we still have a
problem of reporting state info.  Any number of things could be removed or
modified by the time the audit is actually triggered, e.g. files removed,
AppArmor profiles modified, etc...  Any such change means we're logging
garbage.
Sean Christopherson June 14, 2019, 12:46 a.m. UTC | #20
On Thu, Jun 13, 2019 at 01:02:17PM -0400, Stephen Smalley wrote:
> On 6/11/19 6:02 PM, Sean Christopherson wrote:
> >On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> >>I haven't looked at this code closely, but it feels like a lot of
> >>SGX-specific logic embedded into SELinux that will have to be repeated or
> >>reused for every security module.  Does SGX not track this state itself?
> >
> >SGX does track equivalent state.
> >
> >There are three proposals on the table (I think):
> >
> >   1. Require userspace to explicitly specificy (maximal) enclave page
> >      permissions at build time.  The enclave page permissions are provided
> >      to, and checked by, LSMs at enclave build time.
> >
> >      Pros: Low-complexity kernel implementation, straightforward auditing
> >      Cons: Sullies the SGX UAPI to some extent, may increase complexity of
> >            SGX2 enclave loaders.
> >
> >   2. Pre-check LSM permissions and dynamically track mappings to enclave
> >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> >      based on the pre-checked permissions.
> >
> >      Pros: Does not impact SGX UAPI, medium kernel complexity
> >      Cons: Auditing is complex/weird, requires taking enclave-specific
> >            lock during mprotect() to query/update tracking.
> >
> >   3. Implement LSM hooks in SGX to allow LSMs to track enclave regions
> >      from cradle to grave, but otherwise defer everything to LSMs.
> >
> >      Pros: Does not impact SGX UAPI, maximum flexibility, precise auditing
> >      Cons: Most complex and "heaviest" kernel implementation of the three,
> >            pushes more SGX details into LSMs.
> >
> >My RFC series[1] implements #1.  My understanding is that Andy (Lutomirski)
> >prefers #2.  Cedric's RFC series implements #3.
> >
> >Perhaps the easiest way to make forward progress is to rule out the
> >options we absolutely *don't* want by focusing on the potentially blocking
> >issue with each option:
> >
> >   #1 - SGX UAPI funkiness
> >
> >   #2 - Auditing complexity, potential enclave lock contention
> >
> >   #3 - Pushing SGX details into LSMs and complexity of kernel implementation
> >
> >
> >[1] https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherson@intel.com
> 
> Given the complexity tradeoff, what is the clear motivating example for why
> #1 isn't the obvious choice? That the enclave loader has no way of knowing a
> priori whether the enclave will require W->X or WX?  But aren't we better
> off requiring enclaves to be explicitly marked as needing such so that we
> can make a more informed decision about whether to load them in the first
> place?

Andy and/or Cedric, can you please weigh in with a concrete (and practical)
use case that will break if we go with #1?  The auditing issues for #2/#3
are complex to say the least...
Sean Christopherson June 14, 2019, 3:38 p.m. UTC | #21
On Thu, Jun 13, 2019 at 05:46:00PM -0700, Sean Christopherson wrote:
> On Thu, Jun 13, 2019 at 01:02:17PM -0400, Stephen Smalley wrote:
> > On 6/11/19 6:02 PM, Sean Christopherson wrote:
> > >On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> > >>I haven't looked at this code closely, but it feels like a lot of
> > >>SGX-specific logic embedded into SELinux that will have to be repeated or
> > >>reused for every security module.  Does SGX not track this state itself?
> > >
> > >SGX does track equivalent state.
> > >
> > >There are three proposals on the table (I think):
> > >
> > >   1. Require userspace to explicitly specificy (maximal) enclave page
> > >      permissions at build time.  The enclave page permissions are provided
> > >      to, and checked by, LSMs at enclave build time.
> > >
> > >      Pros: Low-complexity kernel implementation, straightforward auditing
> > >      Cons: Sullies the SGX UAPI to some extent, may increase complexity of
> > >            SGX2 enclave loaders.
> > >
> > >   2. Pre-check LSM permissions and dynamically track mappings to enclave
> > >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> > >      based on the pre-checked permissions.
> > >
> > >      Pros: Does not impact SGX UAPI, medium kernel complexity
> > >      Cons: Auditing is complex/weird, requires taking enclave-specific
> > >            lock during mprotect() to query/update tracking.
> > >
> > >   3. Implement LSM hooks in SGX to allow LSMs to track enclave regions
> > >      from cradle to grave, but otherwise defer everything to LSMs.
> > >
> > >      Pros: Does not impact SGX UAPI, maximum flexibility, precise auditing
> > >      Cons: Most complex and "heaviest" kernel implementation of the three,
> > >            pushes more SGX details into LSMs.
> > >
> > >My RFC series[1] implements #1.  My understanding is that Andy (Lutomirski)
> > >prefers #2.  Cedric's RFC series implements #3.
> > >
> > >Perhaps the easiest way to make forward progress is to rule out the
> > >options we absolutely *don't* want by focusing on the potentially blocking
> > >issue with each option:
> > >
> > >   #1 - SGX UAPI funkiness
> > >
> > >   #2 - Auditing complexity, potential enclave lock contention
> > >
> > >   #3 - Pushing SGX details into LSMs and complexity of kernel implementation
> > >
> > >
> > >[1] https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherson@intel.com
> > 
> > Given the complexity tradeoff, what is the clear motivating example for why
> > #1 isn't the obvious choice? That the enclave loader has no way of knowing a
> > priori whether the enclave will require W->X or WX?  But aren't we better
> > off requiring enclaves to be explicitly marked as needing such so that we
> > can make a more informed decision about whether to load them in the first
> > place?
> 
> Andy and/or Cedric, can you please weigh in with a concrete (and practical)
> use case that will break if we go with #1?  The auditing issues for #2/#3
> are complex to say the least...

Follow-up question, is #1 any more palatable if SELinux adds SGX specific
permissions and ties them to the process (instead of the vma or sigstruct)?

Something like this for SELinux, where the absolute worst case scenario is
that SGX2 enclave loaders need SGXEXECMEM.  Graphene would need SGXEXECUNMR
and probably SGXEXECANON.

static inline int sgx_has_perm(u32 sid, u32 requested)
{
        return avc_has_perm(&selinux_state, sid, sid,
			    SECCLASS_PROCESS2, requested, NULL);
}

static int selinux_enclave_load(struct vm_area_struct *vma, unsigned long prot,
				bool measured)
{
	const struct cred *cred = current_cred();
	u32 sid = cred_sid(cred);
	int ret;

	/* SGX is supported only in 64-bit kernels. */
	WARN_ON_ONCE(!default_noexec);

	/* Only executable enclave pages are restricted in any way. */
	if (!(prot & PROT_EXEC))
		return 0;

	/*
	 * Private mappings to enclave pages are impossible, ergo we don't
	 * differentiate between W->X and WX, either case requires EXECMEM.
	 */
	if (prot & PROT_WRITE) {
		ret = sgx_has_perm(sid, PROCESS2__SGXEXECMEM);
		if (ret)
			goto out;
	}
	if (!measured) {
		ret = sgx_has_perm(sid, PROCESS2__SGXEXECUNMR);
		if (ret)
			goto out;
	}

	if (!vma->vm_file || !IS_PRIVATE(file_inode(vma->vm_file)) ||
	    vma->anon_vma) {
		/*
		 * Loading enclave code from an anonymous mapping or from a
		 * modified private file mapping.
		 */
		ret = sgx_has_perm(sid, PROCESS2__SGXEXECANON);
		if (ret)
			goto out;
	} else {
		/* Loading from a shared or unmodified private file mapping. */
		ret = sgx_has_perm(sid, PROCESS2__SGXEXECFILE);
		if (ret)
			goto out;

		/* The source file must be executable in this case. */
		ret = file_has_perm(cred, vma->vm_file, FILE__EXECUTE);
		if (ret)
			goto out;
	}

out:
	return ret;
}


Given that AppArmor generally only cares about accessing files, its
enclave_load() implementation would be something like:

static int apparmor_enclave_load(struct vm_area_struct *vma, unsigned long prot,
				bool measured)
{
	if (!(prot & PROT_EXEC))
		return 0;

	return common_file_perm(OP_ENCL_LOAD, vma->vm_file, AA_EXEC_MMAP);
}
Xing, Cedric June 14, 2019, 5:16 p.m. UTC | #22
> From: Christopherson, Sean J
> Sent: Thursday, June 13, 2019 5:46 PM
> 
> On Thu, Jun 13, 2019 at 01:02:17PM -0400, Stephen Smalley wrote:
> > On 6/11/19 6:02 PM, Sean Christopherson wrote:
> > >On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> > >>I haven't looked at this code closely, but it feels like a lot of
> > >>SGX-specific logic embedded into SELinux that will have to be
> > >>repeated or reused for every security module.  Does SGX not track
> this state itself?
> > >
> > >SGX does track equivalent state.
> > >
> > >There are three proposals on the table (I think):
> > >
> > >   1. Require userspace to explicitly specificy (maximal) enclave
> page
> > >      permissions at build time.  The enclave page permissions are
> provided
> > >      to, and checked by, LSMs at enclave build time.
> > >
> > >      Pros: Low-complexity kernel implementation, straightforward
> auditing
> > >      Cons: Sullies the SGX UAPI to some extent, may increase
> complexity of
> > >            SGX2 enclave loaders.
> > >
> > >   2. Pre-check LSM permissions and dynamically track mappings to
> enclave
> > >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> > >      based on the pre-checked permissions.
> > >
> > >      Pros: Does not impact SGX UAPI, medium kernel complexity
> > >      Cons: Auditing is complex/weird, requires taking enclave-
> specific
> > >            lock during mprotect() to query/update tracking.
> > >
> > >   3. Implement LSM hooks in SGX to allow LSMs to track enclave
> regions
> > >      from cradle to grave, but otherwise defer everything to LSMs.
> > >
> > >      Pros: Does not impact SGX UAPI, maximum flexibility, precise
> auditing
> > >      Cons: Most complex and "heaviest" kernel implementation of the
> three,
> > >            pushes more SGX details into LSMs.
> > >
> > >My RFC series[1] implements #1.  My understanding is that Andy
> > >(Lutomirski) prefers #2.  Cedric's RFC series implements #3.
> > >
> > >Perhaps the easiest way to make forward progress is to rule out the
> > >options we absolutely *don't* want by focusing on the potentially
> > >blocking issue with each option:
> > >
> > >   #1 - SGX UAPI funkiness
> > >
> > >   #2 - Auditing complexity, potential enclave lock contention
> > >
> > >   #3 - Pushing SGX details into LSMs and complexity of kernel
> > > implementation
> > >
> > >
> > >[1]
> > >https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherso
> > >n@intel.com
> >
> > Given the complexity tradeoff, what is the clear motivating example
> > for why
> > #1 isn't the obvious choice? That the enclave loader has no way of
> > knowing a priori whether the enclave will require W->X or WX?  But
> > aren't we better off requiring enclaves to be explicitly marked as
> > needing such so that we can make a more informed decision about
> > whether to load them in the first place?
> 
> Andy and/or Cedric, can you please weigh in with a concrete (and
> practical) use case that will break if we go with #1?  The auditing
> issues for #2/#3 are complex to say the least...

How does enclave loader provide per-page ALLOW_* flags? And a related question is why they are necessary for enclaves but unnecessary for regular executables or shared objects.

What's the story for SGX2 if mmap()'ing non-existing pages is not allowed?

What's the story for auditing?

After everything above has been taken care of properly, will #1 still be simpler than #2/#3?
Sean Christopherson June 14, 2019, 5:45 p.m. UTC | #23
On Fri, Jun 14, 2019 at 10:16:55AM -0700, Xing, Cedric wrote:
> > From: Christopherson, Sean J
> > Sent: Thursday, June 13, 2019 5:46 PM
> > 
> > On Thu, Jun 13, 2019 at 01:02:17PM -0400, Stephen Smalley wrote:
> > > On 6/11/19 6:02 PM, Sean Christopherson wrote:
> > > >My RFC series[1] implements #1.  My understanding is that Andy
> > > >(Lutomirski) prefers #2.  Cedric's RFC series implements #3.
> > > >
> > > >Perhaps the easiest way to make forward progress is to rule out the
> > > >options we absolutely *don't* want by focusing on the potentially
> > > >blocking issue with each option:
> > > >
> > > >   #1 - SGX UAPI funkiness
> > > >
> > > >   #2 - Auditing complexity, potential enclave lock contention
> > > >
> > > >   #3 - Pushing SGX details into LSMs and complexity of kernel
> > > > implementation
> > > >
> > > >
> > > >[1]
> > > >https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherso
> > > >n@intel.com
> > >
> > > Given the complexity tradeoff, what is the clear motivating example
> > > for why
> > > #1 isn't the obvious choice? That the enclave loader has no way of
> > > knowing a priori whether the enclave will require W->X or WX?  But
> > > aren't we better off requiring enclaves to be explicitly marked as
> > > needing such so that we can make a more informed decision about
> > > whether to load them in the first place?
> > 
> > Andy and/or Cedric, can you please weigh in with a concrete (and
> > practical) use case that will break if we go with #1?  The auditing
> > issues for #2/#3 are complex to say the least...
> 
> How does enclave loader provide per-page ALLOW_* flags?

Unchanged from my RFC, i.e. specified at SGX_IOC_ENCLAVE_ADD_PAGE(S).

> And a related question is why they are necessary for enclaves but
> unnecessary for regular executables or shared objects.

Because at mmap()/mprotect() time we don't have the source file of the
enclave page to check SELinux's FILE__EXECUTE or AppArmor's AA_EXEC_MMAP.

> What's the story for SGX2 if mmap()'ing non-existing pages is not allowed?

Userspace will need to invoke an ioctl() to tell SGX "this range can be
EAUG'd".

> 
> What's the story for auditing?

It happens naturally when security_enclave_load() is called.  Am I
missing something?

> After everything above has been taken care of properly, will #1 still be
> simpler than #2/#3?

The state tracking of #2/#3 doesn't scare me, it's purely the auditing.
Holding an audit message for an indeterminate amount of time is a
nightmare.

Here's a thought.  What if we simply require FILE__EXECUTE or AA_EXEC_MAP
to load any enclave page from a file?  Alternatively, we could add an SGX
specific file policity, e.g. FILE__ENCLAVELOAD and AA_MAY_LOAD_ENCLAVE.
As in my other email, SELinux's W^X restrictions can be tied to the process,
i.e. they can be checked at mmap()/mprotect() without throwing a wrench in
auditing.
Sean Christopherson June 14, 2019, 5:53 p.m. UTC | #24
On Fri, Jun 14, 2019 at 10:45:56AM -0700, Sean Christopherson wrote:
> The state tracking of #2/#3 doesn't scare me, it's purely the auditing.
> Holding an audit message for an indeterminate amount of time is a
> nightmare.
> 
> Here's a thought.  What if we simply require FILE__EXECUTE or AA_EXEC_MAP
> to load any enclave page from a file?  Alternatively, we could add an SGX
> specific file policity, e.g. FILE__ENCLAVELOAD and AA_MAY_LOAD_ENCLAVE.
> As in my other email, SELinux's W^X restrictions can be tied to the process,
> i.e. they can be checked at mmap()/mprotect() without throwing a wrench in
> auditing.

We would also need to require VM_MAYEXEC on all enclave pages, or forego
enforcing path_noexec() for enclaves.
Sean Christopherson June 14, 2019, 8:01 p.m. UTC | #25
On Fri, Jun 14, 2019 at 10:53:39AM -0700, Sean Christopherson wrote:
> On Fri, Jun 14, 2019 at 10:45:56AM -0700, Sean Christopherson wrote:
> > The state tracking of #2/#3 doesn't scare me, it's purely the auditing.
> > Holding an audit message for an indeterminate amount of time is a
> > nightmare.
> > 
> > Here's a thought.  What if we simply require FILE__EXECUTE or AA_EXEC_MAP
> > to load any enclave page from a file?  Alternatively, we could add an SGX
> > specific file policity, e.g. FILE__ENCLAVELOAD and AA_MAY_LOAD_ENCLAVE.
> > As in my other email, SELinux's W^X restrictions can be tied to the process,
> > i.e. they can be checked at mmap()/mprotect() without throwing a wrench in
> > auditing.
> 
> We would also need to require VM_MAYEXEC on all enclave pages, or forego
> enforcing path_noexec() for enclaves.

Scratch that thought.   Tying W^X restrictions to the process only works
if its done at load time.  E.g. If process A maps a page W and process B
maps the same page X, then which process needs W^X depends on the order of
mmap()/mprotect() between the two processes.
Dr. Greg June 14, 2019, 11:19 p.m. UTC | #26
On Thu, Jun 13, 2019 at 05:46:00PM -0700, Sean Christopherson wrote:

Good afternoon, I hope the week is ending well for everyone.

> On Thu, Jun 13, 2019 at 01:02:17PM -0400, Stephen Smalley wrote:
> > Given the complexity tradeoff, what is the clear motivating
> > example for why #1 isn't the obvious choice? That the enclave
> > loader has no way of knowing a priori whether the enclave will
> > require W->X or WX?  But aren't we better off requiring enclaves
> > to be explicitly marked as needing such so that we can make a more
> > informed decision about whether to load them in the first place?

> Andy and/or Cedric, can you please weigh in with a concrete (and
> practical) use case that will break if we go with #1?  The auditing
> issues for #2/#3 are complex to say the least...

So we are back to choosing door 1, door 2 or door 3.

That brings us back to our previous e-mail, where we suggested that
the most fundamental question to answer with the LSM issue is how much
effective security is being purchased at what complexity cost.

We are practical guys at our company, we direct the development and
deployment of practical SGX systems, including an independent
implementation of SGX runtime/attestation/provisioning et.al.  Our
comments, for whatever they are worth, are meant to reflect the real
world deployment of this technology.

Lets start big picture.

One of the clients we are consulting with on this technology is
running well north of 1400 Linux systems.  Every one of which has
selinux=0 in /proc/cmdline and will do so until approximately the heat
death of the Universe.

Our AI LSM will use any SGX LSM driver hooks that eventuate from these
discussions, so we support the notion of the LSM getting a look at
permissions of executable code.  However, our client isn't unique in
their configuration choice, so we believe this fact calls the question
as to how much SGX specific complexity should be injected into the
LSM.

So, as we noted in our previous e-mail, there are only two relevant
security questions the LSM needs to answer:

1.) Should a page of memory with executable content be allowed into an
enclave?

2.) Should an enclave be allowed to possess one or more pages of
executable memory which will have WX permissions sometime during its
lifetime?

Sean is suggesting the strategy of an ioctl to call out pages that
conform to question 2 (EAUG'ed pages).  That doesn't seem like an
onerous requirement, since all of the current enclave loaders already
have all of the metadata infrastructure to map/load page ranges.  The
EAUG WX range would simply be another layout type that gets walked
over when the enclave image is built.

Given that, we were somewhat surprised to hear Sean say that he had
been advised that door 1 was a non-starter.  Presumably this was
because of the need to delineate a specific cohort of pages that will
be permitted WX.  If that is the case, the question that needs to be
called, as Stephen alludes to above, is whether or not WX privileges
should be considered a characterizing feature of the VMA that defines
an enclave rather then a per page attribute.

Do we realistically believe that an LSM will be implemented that
reacts differently when the 357th page of WX memory is added as
opposed to the first?  The operative security question is whether or
not the platform owner is willing to allow arbitrary executable code,
that they may have no visibility into, to be executed on their
platform.

We talk to people that, as a technology, SGX is about building
'security archipelagos', islands of trusted execution on potentially
multiple platforms that interact to deliver a service, all of which
consider their surrounding platforms and the network in between them
as adversarial.  This model is, by definition, adverserial to the
notion and function of the LSM.

With respect to SGX dynamic code loading, the future for security
concious architectures, will be to pull the code from remotely
attested repository servers over the network.  The only relevant
security question that can be answered is whether or not a platform
owner feels comfortable with that model.

Best wishes for a pleasant weekend to everyone.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686
FAX: 701-281-3949           EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"My spin on the meeting?  I lie somewhere between the individual who
 feels that we are all going to join hands and march forward carrying
 the organization into the information age and Dr. Wettstein.  Who
 feels that they are holding secret meetings at 6 o'clock in the
 morning plotting strategy on how to replace our system."
                                -- Paul S. Etzell, M.D.
                                   Medical Director, Roger Maris Cancer Center
Andy Lutomirski June 16, 2019, 10:14 p.m. UTC | #27
On Fri, Jun 14, 2019 at 8:38 AM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Thu, Jun 13, 2019 at 05:46:00PM -0700, Sean Christopherson wrote:
> > On Thu, Jun 13, 2019 at 01:02:17PM -0400, Stephen Smalley wrote:
> > > On 6/11/19 6:02 PM, Sean Christopherson wrote:
> > > >On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> > > >>I haven't looked at this code closely, but it feels like a lot of
> > > >>SGX-specific logic embedded into SELinux that will have to be repeated or
> > > >>reused for every security module.  Does SGX not track this state itself?
> > > >
> > > >SGX does track equivalent state.
> > > >
> > > >There are three proposals on the table (I think):
> > > >
> > > >   1. Require userspace to explicitly specificy (maximal) enclave page
> > > >      permissions at build time.  The enclave page permissions are provided
> > > >      to, and checked by, LSMs at enclave build time.
> > > >
> > > >      Pros: Low-complexity kernel implementation, straightforward auditing
> > > >      Cons: Sullies the SGX UAPI to some extent, may increase complexity of
> > > >            SGX2 enclave loaders.
> > > >
> > > >   2. Pre-check LSM permissions and dynamically track mappings to enclave
> > > >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> > > >      based on the pre-checked permissions.
> > > >
> > > >      Pros: Does not impact SGX UAPI, medium kernel complexity
> > > >      Cons: Auditing is complex/weird, requires taking enclave-specific
> > > >            lock during mprotect() to query/update tracking.
> > > >
> > > >   3. Implement LSM hooks in SGX to allow LSMs to track enclave regions
> > > >      from cradle to grave, but otherwise defer everything to LSMs.
> > > >
> > > >      Pros: Does not impact SGX UAPI, maximum flexibility, precise auditing
> > > >      Cons: Most complex and "heaviest" kernel implementation of the three,
> > > >            pushes more SGX details into LSMs.
> > > >
> > > >My RFC series[1] implements #1.  My understanding is that Andy (Lutomirski)
> > > >prefers #2.  Cedric's RFC series implements #3.
> > > >
> > > >Perhaps the easiest way to make forward progress is to rule out the
> > > >options we absolutely *don't* want by focusing on the potentially blocking
> > > >issue with each option:
> > > >
> > > >   #1 - SGX UAPI funkiness
> > > >
> > > >   #2 - Auditing complexity, potential enclave lock contention
> > > >
> > > >   #3 - Pushing SGX details into LSMs and complexity of kernel implementation
> > > >
> > > >
> > > >[1] https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherson@intel.com
> > >
> > > Given the complexity tradeoff, what is the clear motivating example for why
> > > #1 isn't the obvious choice? That the enclave loader has no way of knowing a
> > > priori whether the enclave will require W->X or WX?  But aren't we better
> > > off requiring enclaves to be explicitly marked as needing such so that we
> > > can make a more informed decision about whether to load them in the first
> > > place?
> >
> > Andy and/or Cedric, can you please weigh in with a concrete (and practical)
> > use case that will break if we go with #1?  The auditing issues for #2/#3
> > are complex to say the least...

The most significant issue I see is the following.  Consider two
cases. First, an SGX2 enclave that dynamically allocates memory but
doesn't execute code from dynamic memory.  Second, an SGX2 enclave
that *does* execute code from dynamic memory.  In #1, the untrusted
stack needs to decide whether to ALLOW_EXEC when the memory is
allocated, which means that it either needs to assume the worst or it
needs to know at allocation time whether the enclave ever intends to
change the permission to X.

I suppose there's a middle ground.  The driver could use model #1 for
driver-filled pages and model #2 for dynamic pages.  I haven't tried
to fully work it out, but I think there would be the ALLOW_READ /
ALLOW_WRITE / ALLOW_EXEC flag for EADD-ed pages but, for EAUG-ed
pages, there would be a different policy.  This might be as simple as
internally having four flags instead of three:

ALLOW_READ, ALLOW_WRITE, ALLOW_EXEC: as before

ALLOW_EXEC_COND: set implicitly by the driver for EAUG.

As in #1, if you try to mmap or protect a page with neither ALLOW_EXEC
variant, it fails (-EACCES, perhaps).  But, if you try to mmap or
mprotect an ALLOW_EXEC_COND page with PROT_EXEC, you ask LSM for
permission.  There is no fancy DIRTY tracking here, since it's
reasonable to just act as though *every* ALLOW_EXEC_COND page is
dirty.  There is no real auditing issue here, since LSM can just log
what permission is missing.

Does this seem sensible?  It might give us the best of #1 and #2.

>
> Follow-up question, is #1 any more palatable if SELinux adds SGX specific
> permissions and ties them to the process (instead of the vma or sigstruct)?

I'm not sure this makes a difference.  It simplifies SIGSTRUCT
handling, which is handy.
Andy Lutomirski June 16, 2019, 10:16 p.m. UTC | #28
On Fri, Jun 14, 2019 at 10:16 AM Xing, Cedric <cedric.xing@intel.com> wrote:
>
> > From: Christopherson, Sean J
> > Sent: Thursday, June 13, 2019 5:46 PM
> >
> > On Thu, Jun 13, 2019 at 01:02:17PM -0400, Stephen Smalley wrote:
> > > On 6/11/19 6:02 PM, Sean Christopherson wrote:
> > > >On Tue, Jun 11, 2019 at 09:40:25AM -0400, Stephen Smalley wrote:
> > > >>I haven't looked at this code closely, but it feels like a lot of
> > > >>SGX-specific logic embedded into SELinux that will have to be
> > > >>repeated or reused for every security module.  Does SGX not track
> > this state itself?
> > > >
> > > >SGX does track equivalent state.
> > > >
> > > >There are three proposals on the table (I think):
> > > >
> > > >   1. Require userspace to explicitly specificy (maximal) enclave
> > page
> > > >      permissions at build time.  The enclave page permissions are
> > provided
> > > >      to, and checked by, LSMs at enclave build time.
> > > >
> > > >      Pros: Low-complexity kernel implementation, straightforward
> > auditing
> > > >      Cons: Sullies the SGX UAPI to some extent, may increase
> > complexity of
> > > >            SGX2 enclave loaders.
> > > >
> > > >   2. Pre-check LSM permissions and dynamically track mappings to
> > enclave
> > > >      pages, e.g. add an SGX mprotect() hook to restrict W->X and WX
> > > >      based on the pre-checked permissions.
> > > >
> > > >      Pros: Does not impact SGX UAPI, medium kernel complexity
> > > >      Cons: Auditing is complex/weird, requires taking enclave-
> > specific
> > > >            lock during mprotect() to query/update tracking.
> > > >
> > > >   3. Implement LSM hooks in SGX to allow LSMs to track enclave
> > regions
> > > >      from cradle to grave, but otherwise defer everything to LSMs.
> > > >
> > > >      Pros: Does not impact SGX UAPI, maximum flexibility, precise
> > auditing
> > > >      Cons: Most complex and "heaviest" kernel implementation of the
> > three,
> > > >            pushes more SGX details into LSMs.
> > > >
> > > >My RFC series[1] implements #1.  My understanding is that Andy
> > > >(Lutomirski) prefers #2.  Cedric's RFC series implements #3.
> > > >
> > > >Perhaps the easiest way to make forward progress is to rule out the
> > > >options we absolutely *don't* want by focusing on the potentially
> > > >blocking issue with each option:
> > > >
> > > >   #1 - SGX UAPI funkiness
> > > >
> > > >   #2 - Auditing complexity, potential enclave lock contention
> > > >
> > > >   #3 - Pushing SGX details into LSMs and complexity of kernel
> > > > implementation
> > > >
> > > >
> > > >[1]
> > > >https://lkml.kernel.org/r/20190606021145.12604-1-sean.j.christopherso
> > > >n@intel.com
> > >
> > > Given the complexity tradeoff, what is the clear motivating example
> > > for why
> > > #1 isn't the obvious choice? That the enclave loader has no way of
> > > knowing a priori whether the enclave will require W->X or WX?  But
> > > aren't we better off requiring enclaves to be explicitly marked as
> > > needing such so that we can make a more informed decision about
> > > whether to load them in the first place?
> >
> > Andy and/or Cedric, can you please weigh in with a concrete (and
> > practical) use case that will break if we go with #1?  The auditing
> > issues for #2/#3 are complex to say the least...
>
> How does enclave loader provide per-page ALLOW_* flags? And a related question is why they are necessary for enclaves but unnecessary for regular executables or shared objects.
>
> What's the story for SGX2 if mmap()'ing non-existing pages is not allowed?
>

I think it just works.  Either you can't mmap() the page until you
have explicitly EAUG-ed it, or you add a new ioctl() that is
effectively "EAUG lazily".  The latter would declare that address and
request that it get allocated and EAUGed when faulted, but it wouldn't
actually do the EAUG.

--Andy
Sean Christopherson June 17, 2019, 4:49 p.m. UTC | #29
On Sun, Jun 16, 2019 at 03:14:51PM -0700, Andy Lutomirski wrote:
> On Fri, Jun 14, 2019 at 8:38 AM Sean Christopherson
> <sean.j.christopherson@intel.com> wrote:
> > > Andy and/or Cedric, can you please weigh in with a concrete (and practical)
> > > use case that will break if we go with #1?  The auditing issues for #2/#3
> > > are complex to say the least...
> 
> The most significant issue I see is the following.  Consider two
> cases. First, an SGX2 enclave that dynamically allocates memory but
> doesn't execute code from dynamic memory.  Second, an SGX2 enclave
> that *does* execute code from dynamic memory.  In #1, the untrusted
> stack needs to decide whether to ALLOW_EXEC when the memory is
> allocated, which means that it either needs to assume the worst or it
> needs to know at allocation time whether the enclave ever intends to
> change the permission to X.

I'm just not convinced that folks running enclaves that can't communicate
their basic functionality will care one whit about SELinux restrictions,
i.e. will happily give EXECMOD even if it's not strictly necessary.
 
> I suppose there's a middle ground.  The driver could use model #1 for
> driver-filled pages and model #2 for dynamic pages.  I haven't tried
> to fully work it out, but I think there would be the ALLOW_READ /
> ALLOW_WRITE / ALLOW_EXEC flag for EADD-ed pages but, for EAUG-ed
> pages, there would be a different policy.  This might be as simple as
> internally having four flags instead of three:
> 
> ALLOW_READ, ALLOW_WRITE, ALLOW_EXEC: as before
> 
> ALLOW_EXEC_COND: set implicitly by the driver for EAUG.
> 
> As in #1, if you try to mmap or protect a page with neither ALLOW_EXEC
> variant, it fails (-EACCES, perhaps).  But, if you try to mmap or
> mprotect an ALLOW_EXEC_COND page with PROT_EXEC, you ask LSM for
> permission.  There is no fancy DIRTY tracking here, since it's
> reasonable to just act as though *every* ALLOW_EXEC_COND page is
> dirty.  There is no real auditing issue here, since LSM can just log
> what permission is missing.
> 
> Does this seem sensible?  It might give us the best of #1 and #2.

It would work and is easy to implement *if* SELinux ties permissions to
the process, as the SIGSTRUCT vma/file won't be available at
EAUG+mprotect().  I already have a set of patches to that effect, I'll
send 'em out in a bit.

FWIW, we still need to differentiate W->X from WX on SGX1, i.e. declaring
ALLOW_WRITE + ALLOW_EXEC shouldn't imply WX.  This is also addressed in
the forthcoming updated RFC.

> > Follow-up question, is #1 any more palatable if SELinux adds SGX specific
> > permissions and ties them to the process (instead of the vma or sigstruct)?
> 
> I'm not sure this makes a difference.  It simplifies SIGSTRUCT
> handling, which is handy.
Andy Lutomirski June 17, 2019, 5:08 p.m. UTC | #30
On Mon, Jun 17, 2019 at 9:49 AM Sean Christopherson
<sean.j.christopherson@intel.com> wrote:
>
> On Sun, Jun 16, 2019 at 03:14:51PM -0700, Andy Lutomirski wrote:
> > On Fri, Jun 14, 2019 at 8:38 AM Sean Christopherson
> > <sean.j.christopherson@intel.com> wrote:
> > > > Andy and/or Cedric, can you please weigh in with a concrete (and practical)
> > > > use case that will break if we go with #1?  The auditing issues for #2/#3
> > > > are complex to say the least...
> >
> > The most significant issue I see is the following.  Consider two
> > cases. First, an SGX2 enclave that dynamically allocates memory but
> > doesn't execute code from dynamic memory.  Second, an SGX2 enclave
> > that *does* execute code from dynamic memory.  In #1, the untrusted
> > stack needs to decide whether to ALLOW_EXEC when the memory is
> > allocated, which means that it either needs to assume the worst or it
> > needs to know at allocation time whether the enclave ever intends to
> > change the permission to X.
>
> I'm just not convinced that folks running enclaves that can't communicate
> their basic functionality will care one whit about SELinux restrictions,
> i.e. will happily give EXECMOD even if it's not strictly necessary.

At least when permissions are learned, if there's no ALLOW_EXEC for
EAUG, then EXECMOD won't get learned if there's no eventual attempt to
execute the memory.

>
> > I suppose there's a middle ground.  The driver could use model #1 for
> > driver-filled pages and model #2 for dynamic pages.  I haven't tried
> > to fully work it out, but I think there would be the ALLOW_READ /
> > ALLOW_WRITE / ALLOW_EXEC flag for EADD-ed pages but, for EAUG-ed
> > pages, there would be a different policy.  This might be as simple as
> > internally having four flags instead of three:
> >
> > ALLOW_READ, ALLOW_WRITE, ALLOW_EXEC: as before
> >
> > ALLOW_EXEC_COND: set implicitly by the driver for EAUG.
> >
> > As in #1, if you try to mmap or protect a page with neither ALLOW_EXEC
> > variant, it fails (-EACCES, perhaps).  But, if you try to mmap or
> > mprotect an ALLOW_EXEC_COND page with PROT_EXEC, you ask LSM for
> > permission.  There is no fancy DIRTY tracking here, since it's
> > reasonable to just act as though *every* ALLOW_EXEC_COND page is
> > dirty.  There is no real auditing issue here, since LSM can just log
> > what permission is missing.
> >
> > Does this seem sensible?  It might give us the best of #1 and #2.
>
> It would work and is easy to implement *if* SELinux ties permissions to
> the process, as the SIGSTRUCT vma/file won't be available at
> EAUG+mprotect().  I already have a set of patches to that effect, I'll
> send 'em out in a bit.

I'm okay with that.

>
> FWIW, we still need to differentiate W->X from WX on SGX1, i.e. declaring
> ALLOW_WRITE + ALLOW_EXEC shouldn't imply WX.  This is also addressed in
> the forthcoming updated RFC.

Sounds good.
Dr. Greg June 18, 2019, 3:40 p.m. UTC | #31
On Mon, Jun 17, 2019 at 09:49:15AM -0700, Sean Christopherson wrote:

Good morning to everyone.

> On Sun, Jun 16, 2019 at 03:14:51PM -0700, Andy Lutomirski wrote:
> > The most significant issue I see is the following.  Consider two
> > cases. First, an SGX2 enclave that dynamically allocates memory but
> > doesn't execute code from dynamic memory.  Second, an SGX2 enclave
> > that *does* execute code from dynamic memory.  In #1, the untrusted
> > stack needs to decide whether to ALLOW_EXEC when the memory is
> > allocated, which means that it either needs to assume the worst or it
> > needs to know at allocation time whether the enclave ever intends to
> > change the permission to X.

> I'm just not convinced that folks running enclaves that can't
> communicate their basic functionality will care one whit about
> SELinux restrictions, i.e. will happily give EXECMOD even if it's
> not strictly necessary.

Hence the comments in my mail from last Friday.

It seems to us that the path forward is to require the enclave
author/signer to express their intent to implement executable dynamic
memory, see below.

> > I suppose there's a middle ground.  The driver could use model #1 for
> > driver-filled pages and model #2 for dynamic pages.  I haven't tried
> > to fully work it out, but I think there would be the ALLOW_READ /
> > ALLOW_WRITE / ALLOW_EXEC flag for EADD-ed pages but, for EAUG-ed
> > pages, there would be a different policy.  This might be as simple as
> > internally having four flags instead of three:
> > 
> > ALLOW_READ, ALLOW_WRITE, ALLOW_EXEC: as before
> > 
> > ALLOW_EXEC_COND: set implicitly by the driver for EAUG.
> > 
> > As in #1, if you try to mmap or protect a page with neither ALLOW_EXEC
> > variant, it fails (-EACCES, perhaps).  But, if you try to mmap or
> > mprotect an ALLOW_EXEC_COND page with PROT_EXEC, you ask LSM for
> > permission.  There is no fancy DIRTY tracking here, since it's
> > reasonable to just act as though *every* ALLOW_EXEC_COND page is
> > dirty.  There is no real auditing issue here, since LSM can just log
> > what permission is missing.
> > 
> > Does this seem sensible?  It might give us the best of #1 and #2.

> It would work and is easy to implement *if* SELinux ties permissions
> to the process, as the SIGSTRUCT vma/file won't be available at
> EAUG+mprotect().  I already have a set of patches to that effect,
> I'll send 'em out in a bit.

The VMA that is crafted from the enclave file is going to exist for
the life of the enclave.  If the intent to use executable dynamic
memory is specified when the enclave image is being built, or as a
component of enclave initialization, the driver is in a position to
log/deny a request to EAUG+mprotect whenever it occurs.  The sensitive
criteria would seem to be any request for dynamically allocated memory
with executable status.

The potential security impact of dynamically executable content is
something that is dependent on the enclave author rather then the
context of execution that is requesting pages to be allocated for such
purposes.  There is going to be an LSM hook to evaluate the SIGSTRUCT
at the time of EINIT, so all of the necessary information is there to
make a decision on whether or not to flag the VMA as being allowed to
support dynamically executable content.

It doesn't seem like an onerous requirement for this information to be
specified in the enclave metadata.  For optimum security, one could
perhaps argue that the ability to implement dynamic memory should have
been a specifiable attribute of the enclave, similar to the debug,
launch and provisioning attributes.

As we have indicated in the past, once the enclave is initialized with
permissions for dynamically executable content, the platform is
completely dependent on the security intentions of the author of the
enclave.  Given that, the notion of enduring significant LSM
complexity does not seem to be justified.

Which opens up another set of security implications to discuss but we will let
those lie for the moment.

Have a good day.

Dr. Greg

As always,
Dr. G.W. Wettstein, Ph.D.   Enjellic Systems Development, LLC.
4206 N. 19th Ave.           Specializing in information infra-structure
Fargo, ND  58102            development.
PH: 701-281-1686            EMAIL: greg@enjellic.com
------------------------------------------------------------------------------
"More people are killed every year by pigs than by sharks, which shows
 you how good we are at evaluating risk."
                                -- Bruce Schneier
                                   Beyond Fear
diff mbox series

Patch

diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index ccf950409384..58a05a9639e0 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -14,6 +14,8 @@  selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
 
 selinux-$(CONFIG_NETLABEL) += netlabel.o
 
+selinux-$(CONFIG_INTEL_SGX) += intel_sgx.o
+
 ccflags-y := -I$(srctree)/security/selinux -I$(srctree)/security/selinux/include
 
 $(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 3ec702cf46ca..17f855871a41 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -103,6 +103,7 @@ 
 #include "netlabel.h"
 #include "audit.h"
 #include "avc_ss.h"
+#include "intel_sgx.h"
 
 struct selinux_state selinux_state;
 
@@ -3485,6 +3486,11 @@  static int selinux_file_alloc_security(struct file *file)
 	return file_alloc_security(file);
 }
 
+static void selinux_file_free_security(struct file *file)
+{
+	sgxsec_enclave_free(file);
+}
+
 /*
  * Check whether a task has the ioctl permission and cmd
  * operation to an inode.
@@ -3656,6 +3662,7 @@  static int selinux_file_mprotect(struct vm_area_struct *vma,
 				 unsigned long reqprot,
 				 unsigned long prot)
 {
+	int rc;
 	const struct cred *cred = current_cred();
 	u32 sid = cred_sid(cred);
 
@@ -3664,7 +3671,7 @@  static int selinux_file_mprotect(struct vm_area_struct *vma,
 
 	if (default_noexec &&
 	    (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
-		int rc = 0;
+		rc = 0;
 		if (vma->vm_start >= vma->vm_mm->start_brk &&
 		    vma->vm_end <= vma->vm_mm->brk) {
 			rc = avc_has_perm(&selinux_state,
@@ -3691,6 +3698,12 @@  static int selinux_file_mprotect(struct vm_area_struct *vma,
 			return rc;
 	}
 
+#ifdef CONFIG_INTEL_SGX
+	rc = sgxsec_mprotect(vma, prot);
+	if (rc <= 0)
+		return rc;
+#endif
+
 	return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED);
 }
 
@@ -6726,6 +6739,62 @@  static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
 }
 #endif
 
+#ifdef CONFIG_INTEL_SGX
+
+static int selinux_enclave_load(struct file *encl, unsigned long addr,
+				unsigned long size, unsigned long prot,
+				struct vm_area_struct *source)
+{
+	if (source) {
+		/**
+		 * Adding page from source => EADD request
+		 */
+		int rc = selinux_file_mprotect(source, prot, prot);
+		if (rc)
+			return rc;
+
+		if (!(prot & VM_EXEC) &&
+		    selinux_file_mprotect(source, VM_EXEC, VM_EXEC))
+			prot = 0;
+		else {
+			prot = SGX__EXECUTE;
+			if (source->vm_file &&
+			    !file_has_perm(current_cred(), source->vm_file,
+					   FILE__EXECMOD))
+				prot |= SGX__EXECMOD;
+		}
+		return sgxsec_eadd(encl, addr, size, prot);
+	} else {
+		/**
+		  * Adding page from NULL => EAUG request
+		  */
+		return sgxsec_eaug(encl, addr, size, prot);
+	}
+}
+
+static int selinux_enclave_init(struct file *encl,
+				const struct sgx_sigstruct *sigstruct,
+				struct vm_area_struct *vma)
+{
+	int rc = 0;
+
+	if (!vma)
+		rc = -EINVAL;
+
+	if (!rc && !(vma->vm_flags & VM_EXEC))
+		rc = selinux_file_mprotect(vma, VM_EXEC, VM_EXEC);
+
+	if (!rc) {
+		if (vma->vm_file)
+			rc = file_has_perm(current_cred(), vma->vm_file,
+					   FILE__EXECMOD);
+		rc = sgxsec_einit(encl, sigstruct, !rc);
+	}
+	return rc;
+}
+
+#endif
+
 struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
 	.lbs_cred = sizeof(struct task_security_struct),
 	.lbs_file = sizeof(struct file_security_struct),
@@ -6808,6 +6877,7 @@  static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 
 	LSM_HOOK_INIT(file_permission, selinux_file_permission),
 	LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
+	LSM_HOOK_INIT(file_free_security, selinux_file_free_security),
 	LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
 	LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
 	LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
@@ -6968,6 +7038,11 @@  static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
 	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
 #endif
+
+#ifdef CONFIG_INTEL_SGX
+	LSM_HOOK_INIT(enclave_load, selinux_enclave_load),
+	LSM_HOOK_INIT(enclave_init, selinux_enclave_init),
+#endif
 };
 
 static __init int selinux_init(void)
diff --git a/security/selinux/include/intel_sgx.h b/security/selinux/include/intel_sgx.h
new file mode 100644
index 000000000000..8f9c6c734921
--- /dev/null
+++ b/security/selinux/include/intel_sgx.h
@@ -0,0 +1,18 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#ifndef _SELINUX_SGXSEC_H_
+#define _SELINUX_SGXSEC_H_
+
+#include <linux/lsm_hooks.h>
+
+#define SGX__EXECUTE	1
+#define SGX__EXECMOD	2
+
+void sgxsec_enclave_free(struct file *);
+int sgxsec_mprotect(struct vm_area_struct *, size_t);
+int sgxsec_eadd(struct file *, size_t, size_t, size_t);
+int sgxsec_eaug(struct file *, size_t, size_t, size_t);
+int sgxsec_einit(struct file *, const struct sgx_sigstruct *, int);
+
+#endif
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 231262d8eac9..0fb4da7e3a8a 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -71,6 +71,9 @@  struct file_security_struct {
 	u32 fown_sid;		/* SID of file owner (for SIGIO) */
 	u32 isid;		/* SID of inode at the time of file open */
 	u32 pseqno;		/* Policy seqno at the time of file open */
+#ifdef CONFIG_INTEL_SGX
+	atomic_long_t esec;
+#endif
 };
 
 struct superblock_security_struct {
diff --git a/security/selinux/intel_sgx.c b/security/selinux/intel_sgx.c
new file mode 100644
index 000000000000..37dacf5c295f
--- /dev/null
+++ b/security/selinux/intel_sgx.c
@@ -0,0 +1,292 @@ 
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+// Copyright(c) 2016-18 Intel Corporation.
+
+#include "objsec.h"
+#include "intel_sgx.h"
+
+struct region {
+	struct list_head	link;
+	size_t			start;
+	size_t			end;
+	size_t			data;
+};
+
+static inline struct region *region_new(void)
+{
+	struct region *n = kzalloc(sizeof(struct region), GFP_KERNEL);
+	if (n)
+		INIT_LIST_HEAD(&n->link);
+	return n;
+}
+
+static inline void region_free(struct region *r)
+{
+	list_del(&r->link);
+	kfree(r);
+}
+
+static struct list_head *
+region_apply_to_range(struct list_head *rgs,
+		      size_t start, size_t end,
+		      struct list_head *(*cb)(struct region *,
+					      size_t, size_t, size_t),
+		      size_t arg)
+{
+	struct region *r, *n;
+
+	list_for_each_entry(r, rgs, link)
+		if (start < r-> end)
+			break;
+
+	if (&r->link == rgs || end <= r->start)
+		return rgs;
+
+	do {
+		struct list_head *ret;
+		n = list_next_entry(r, link);
+		ret = (*cb)(r, start, end, arg);
+		if (ret)
+			return ret;
+		r = n;
+	} while (&r->link != rgs && r->start < end);
+	return &r->link;
+}
+
+static struct list_head *
+region_clear_cb(struct region *r, size_t start, size_t end, size_t arg)
+{
+	if (end < r->end) {
+		if (start > r->start) {
+			struct region *n = region_new();
+			if (unlikely(!n))
+				return ERR_PTR(-ENOMEM);
+
+			n->start = r->start;
+			n->end = start;
+			n->data = r->data;
+			list_add_tail(&n->link, &r->link);
+		}
+		r->start = end;
+		return &r->link;
+	}
+
+	if (start > r->start)
+		r->end = start;
+	else
+		region_free(r);
+	return NULL;
+}
+
+static inline struct list_head *
+region_clear_range(struct list_head *rgs, size_t start, size_t end)
+{
+	return region_apply_to_range(rgs, start, end, region_clear_cb, 0);
+}
+
+static struct list_head *
+region_add_range(struct list_head *rgs, size_t start, size_t end, size_t data)
+{
+	struct region *r, *n;
+
+	n = list_entry(region_clear_range(rgs, start, end), typeof(*n), link);
+	if (unlikely(IS_ERR_VALUE(&n->link)))
+		return &n->link;
+
+	if (&n->link != rgs && end == n->start && data == n->data) {
+		n->start = start;
+		r = n;
+	} else {
+		r = region_new();
+		if (unlikely(!r))
+			return ERR_PTR(-ENOMEM);
+
+		r->start = start;
+		r->end = end;
+		r->data = data;
+		list_add_tail(&r->link, &n->link);
+	}
+
+	n = list_prev_entry(r, link);
+	if (&n->link != rgs && start == n->end && data == n->data) {
+		r->start = n->start;
+		region_free(n);
+	}
+
+	return &r->link;
+}
+
+static inline int
+enclave_add_pages(struct list_head *rgs, size_t start, size_t end, size_t flags)
+{
+	void *p = region_add_range(rgs, start, end, flags);
+	return PTR_ERR_OR_ZERO(p);
+}
+
+static inline int enclave_prot_allowed(size_t prot, size_t flags)
+{
+	return !(prot & VM_EXEC) || (flags & SGX__EXECUTE);
+}
+
+static struct list_head *
+enclave_prot_check_cb(struct region *r, size_t start, size_t end, size_t prot)
+{
+	if (!enclave_prot_allowed(prot, r->data))
+		return ERR_PTR(-EACCES);
+	return NULL;
+}
+
+static struct list_head *
+enclave_prot_set_cb(struct region *r, size_t start, size_t end, size_t prot)
+{
+	BUG_ON(!enclave_prot_allowed(prot, r->data));
+
+	if (!(prot & VM_WRITE) ||
+	    (r->data & SGX__EXECMOD) ||
+	    !(r->data & SGX__EXECUTE))
+		return NULL;
+
+	if (end < r->end) {
+		struct region *n = region_new();
+		if (unlikely(!n))
+			return ERR_PTR(-ENOMEM);
+
+		n->start = end;
+		n->end = r->end;
+		n->data = r->data;
+		r->end = end;
+		list_add(&n->link, &r->link);
+	}
+
+	if (start > r->start) {
+		struct region *n = region_new();
+		if (unlikely(!n))
+			return ERR_PTR(-ENOMEM);
+
+		n->start = r->start;
+		n->end = start;
+		n->data = r->data;
+		r->start = start;
+		list_add_tail(&n->link, &r->link);
+	}
+
+	r->data &= ~SGX__EXECUTE;
+	return NULL;
+}
+
+static inline int
+enclave_mprotect(struct list_head *rgs, size_t start, size_t end, size_t prot)
+{
+	void *ret;
+
+	ret = region_apply_to_range(rgs, start, end,
+				    enclave_prot_check_cb, prot);
+	if (!IS_ERR_VALUE(ret) && (prot & VM_WRITE))
+		ret = region_apply_to_range(rgs, start, end,
+					    enclave_prot_set_cb, prot);
+	return PTR_ERR_OR_ZERO(ret);
+}
+
+struct enclave_sec {
+	struct rw_semaphore	sem;
+	struct list_head	regions;
+	size_t			eaug_perm;
+};
+
+static inline struct enclave_sec *__esec(struct file_security_struct *fsec)
+{
+	return (struct enclave_sec *)atomic_long_read(&fsec->esec);
+}
+
+static struct enclave_sec *encl_esec(struct file *encl)
+{
+	struct file_security_struct *fsec = selinux_file(encl);
+	struct enclave_sec *esec = __esec(fsec);
+
+	if (unlikely(!esec)) {
+		long n;
+
+		esec = kzalloc(sizeof(*esec), GFP_KERNEL);
+		if (!esec)
+			return NULL;
+
+		init_rwsem(&esec->sem);
+		INIT_LIST_HEAD(&esec->regions);
+
+		n = atomic_long_cmpxchg(&fsec->esec, 0, (long)esec);
+		if (n) {
+			kfree(esec);
+			esec = (typeof(esec))n;
+		}
+	}
+
+	return esec;
+}
+
+void sgxsec_enclave_free(struct file *encl)
+{
+	struct enclave_sec *esec = __esec(selinux_file(encl));
+
+	if (esec) {
+		struct region *r, *n;
+
+		BUG_ON(rwsem_is_locked(&esec->sem));
+
+		list_for_each_entry_safe(r, n, &esec->regions, link)
+			region_free(r);
+
+		kfree(esec);
+	}
+}
+
+int sgxsec_mprotect(struct vm_area_struct *vma, size_t prot)
+{
+	struct enclave_sec *esec;
+	int rc;
+
+	if (!vma->vm_file || !(esec = __esec(selinux_file(vma->vm_file)))) {
+		/* Positive return value indicates non-enclave VMA */
+		return 1;
+	}
+
+	down_read(&esec->sem);
+	rc = enclave_mprotect(&esec->regions, vma->vm_start, vma->vm_end, prot);
+	up_read(&esec->sem);
+	return rc;
+}
+
+int sgxsec_eadd(struct file *encl, size_t start, size_t size, size_t perm)
+{
+	struct enclave_sec *esec = encl_esec(encl);
+	int rc;
+
+	if (down_write_killable(&esec->sem))
+		return -EINTR;
+	rc = enclave_add_pages(&esec->regions, start, start + size, perm);
+	up_write(&esec->sem);
+	return rc;
+}
+
+int sgxsec_eaug(struct file *encl, size_t start, size_t size, size_t prot)
+{
+	struct enclave_sec *esec = encl_esec(encl);
+	int rc = -EPERM;
+
+	if (down_write_killable(&esec->sem))
+		return -EINTR;
+	if (enclave_prot_allowed(prot, esec->eaug_perm))
+		rc = enclave_add_pages(&esec->regions, start, start + size,
+				       esec->eaug_perm);
+	up_write(&esec->sem);
+	return rc;
+}
+
+int sgxsec_einit(struct file *encl, const struct sgx_sigstruct *sigstruct, int execmod)
+{
+	struct enclave_sec *esec = encl_esec(encl);
+
+	if (down_write_killable(&esec->sem))
+		return -EINTR;
+	esec->eaug_perm = execmod ? SGX__EXECUTE | SGX__EXECMOD : 0;
+	up_write(&esec->sem);
+	return 0;
+}