@@ -1919,6 +1919,7 @@ config INTEL_SGX
bool "Intel SGX core functionality"
depends on X86_64 && CPU_SUP_INTEL
select SRCU
+ select MMU_NOTIFIER
---help---
Intel(R) SGX is a set of CPU instructions that can be used by
applications to set aside private regions of code and data, referred
@@ -53,6 +53,32 @@ static int sgx_open(struct inode *inode, struct file *file)
static int sgx_release(struct inode *inode, struct file *file)
{
struct sgx_encl *encl = file->private_data;
+ struct sgx_encl_mm *encl_mm;
+
+ /*
+ * Objects can't be *moved* off an RCU protected list (deletion is ok),
+ * nor can the object be freed until after synchronize_srcu().
+ */
+restart:
+ spin_lock(&encl->mm_lock);
+ if (list_empty(&encl->mm_list)) {
+ encl_mm = NULL;
+ } else {
+ encl_mm = list_first_entry(&encl->mm_list, struct sgx_encl_mm,
+ list);
+ list_del_rcu(&encl_mm->list);
+ }
+ spin_unlock(&encl->mm_lock);
+
+ if (encl_mm) {
+ synchronize_srcu(&encl->srcu);
+
+ mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+
+ kfree(encl_mm);
+
+ goto restart;
+ }
mutex_lock(&encl->lock);
encl->flags |= SGX_ENCL_DEAD;
@@ -132,45 +132,51 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
return entry;
}
-static void sgx_encl_mm_release_deferred(struct work_struct *work)
+static void sgx_encl_mm_release_deferred(struct rcu_head *rcu)
{
struct sgx_encl_mm *encl_mm =
- container_of(work, struct sgx_encl_mm, release_work);
+ container_of(rcu, struct sgx_encl_mm, rcu);
- mmdrop(encl_mm->mm);
- synchronize_srcu(&encl_mm->encl->srcu);
kfree(encl_mm);
}
-static void sgx_encl_mm_release(struct kref *ref)
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
{
struct sgx_encl_mm *encl_mm =
- container_of(ref, struct sgx_encl_mm, refcount);
+ container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ struct sgx_encl_mm *tmp = NULL;
+ /*
+ * The enclave itself can remove encl_mm. Note, objects can't be moved
+ * off an RCU protected list, but deletion is ok.
+ */
spin_lock(&encl_mm->encl->mm_lock);
- list_del_rcu(&encl_mm->list);
+ list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+ if (tmp == encl_mm) {
+ list_del_rcu(&encl_mm->list);
+ break;
+ }
+ }
spin_unlock(&encl_mm->encl->mm_lock);
- /*
- * If the mm has users, then do SRCU synchronization in a worker thread
- * to avoid a deadlock with the reclaimer. The caller holds mmap_sem
- * for write, while the reclaimer will acquire mmap_sem for read if it
- * is reclaiming from this enclave. Invoking synchronize_srcu() here
- * will hang waiting for the reclaimer to drop its RCU read lock, while
- * the reclaimer will get stuck waiting to acquire mmap_sem. The async
- * shenanigans can be avoided if there are no mm users as the reclaimer
- * will not acquire mmap_sem in that case.
- */
- if (atomic_read(&encl_mm->mm->mm_users)) {
- mmgrab(encl_mm->mm);
- INIT_WORK(&encl_mm->release_work, sgx_encl_mm_release_deferred);
- schedule_work(&encl_mm->release_work);
- } else {
+ if (tmp == encl_mm) {
synchronize_srcu(&encl_mm->encl->srcu);
- kfree(encl_mm);
+
+ /*
+ * Delay freeing encl_mm until after mmu_notifier synchronizes
+ * its SRCU to ensure encl_mm cannot be dereferenced.
+ */
+ mmu_notifier_unregister_no_release(mn, mm);
+ mmu_notifier_call_srcu(&encl_mm->rcu,
+ &sgx_encl_mm_release_deferred);
}
}
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+ .release = sgx_mmu_notifier_release,
+};
+
static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
struct mm_struct *mm)
{
@@ -195,6 +201,7 @@ static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
struct sgx_encl_mm *encl_mm;
+ int ret;
lockdep_assert_held_exclusive(&mm->mmap_sem);
@@ -202,12 +209,11 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
return -EINVAL;
/*
- * A dying encl_mm can be observed when synchronize_srcu() is called
- * asynchronously via sgx_encl_mm_release(), e.g. if mmap() closely
- * follows munmap().
+ * mm_structs are kept on mm_list until the mm or the enclave dies,
+ * i.e. once an mm is off the list, it's gone for good, therefore it's
+ * impossible to get a false positive on @mm due to a stale mm_list.
*/
- encl_mm = sgx_encl_find_mm(encl, mm);
- if (encl_mm && kref_get_unless_zero(&encl_mm->refcount))
+ if (sgx_encl_find_mm(encl, mm))
return 0;
encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
@@ -216,18 +222,18 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
encl_mm->encl = encl;
encl_mm->mm = mm;
- kref_init(&encl_mm->refcount);
+ encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+ ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+ if (ret) {
+ kfree(encl_mm);
+ return ret;
+ }
spin_lock(&encl->mm_lock);
list_add_rcu(&encl_mm->list, &encl->mm_list);
spin_unlock(&encl->mm_lock);
- /*
- * Note, in addition to ensuring reclaim sees all encl_mms that have a
- * valid mapping, this synchronize_srcu() also ensures that at most one
- * matching encl_mm is encountered by the refcouting flows, i.e. a live
- * encl_mm isn't hiding behind a dying encl_mm.
- */
synchronize_srcu(&encl->srcu);
return 0;
@@ -244,18 +250,6 @@ static void sgx_vma_open(struct vm_area_struct *vma)
vma->vm_private_data = NULL;
}
-static void sgx_vma_close(struct vm_area_struct *vma)
-{
- struct sgx_encl *encl = vma->vm_private_data;
- struct sgx_encl_mm *encl_mm;
-
- if (!encl)
- return;
-
- encl_mm = sgx_encl_find_mm(encl, vma->vm_mm);
- kref_put(&encl_mm->refcount, sgx_encl_mm_release);
-}
-
static unsigned int sgx_vma_fault(struct vm_fault *vmf)
{
unsigned long addr = (unsigned long)vmf->address;
@@ -391,7 +385,6 @@ static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
}
const struct vm_operations_struct sgx_vm_ops = {
- .close = sgx_vma_close,
.open = sgx_vma_open,
.fault = sgx_vma_fault,
.access = sgx_vma_access,
@@ -9,6 +9,7 @@
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/radix-tree.h>
@@ -58,9 +59,9 @@ enum sgx_encl_flags {
struct sgx_encl_mm {
struct sgx_encl *encl;
struct mm_struct *mm;
- struct kref refcount;
struct list_head list;
- struct work_struct release_work;
+ struct mmu_notifier mmu_notifier;
+ struct rcu_head rcu;
};
struct sgx_encl {
Using per-vma refcounting to track mm_structs associated with an enclave
requires hooking .vm_close(), which in turn prevents the mm from merging
vmas (precisely to allow refcounting).  Avoid refcounting encl_mm
altogether by registering an mmu_notifier at .mmap(), removing the dying
encl_mm at mmu_notifier.release() and protecting mm_list during reclaim
via a per-enclave SRCU (a sketch of the reclaim-side reader follows the
diffstat below).

Removing refcounting/vm_close() allows merging of enclave vmas, at the
cost of delaying removal of encl_mm structs from mm_list, i.e. an mm is
disassociated from an enclave when the mm exits or the enclave dies, as
opposed to when the last vma (in a given mm) is closed.  The impact of
delaying encl_mm removal is the stale encl_mm's memory footprint and
whatever overhead is incurred during EPC reclaim (to walk the defunct
mm's vmas).

Practically speaking, a stale encl_mm will exist for a meaningful amount
of time if and only if the enclave is mapped in a long-lived process and
then passed off to another long-lived process.  It is expected that the
vast majority of use cases will not encounter this condition, e.g. even
using a daemon to build enclaves should not result in a stale encl_mm as
the builder should never need to mmap() the enclave.  Even if there are
scenarios that lead to defunct encl_mms, the cost is likely far
outweighed by the benefits of reducing the number of vmas across all
enclaves.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
---
 arch/x86/Kconfig                      |  1 +
 arch/x86/kernel/cpu/sgx/driver/main.c | 26 ++++++++
 arch/x86/kernel/cpu/sgx/encl.c        | 89 ++++++++++++---------------
 arch/x86/kernel/cpu/sgx/encl.h        |  5 +-
 4 files changed, 71 insertions(+), 50 deletions(-)
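
For reference, below is a minimal, illustrative sketch of the reclaim-side
SRCU reader implied by the commit message; it is not part of the patch.
The function name sgx_reclaim_walk_sketch() and the elided PTE-aging body
are assumptions for illustration (the real reclaimer lives elsewhere in
the SGX driver); only the locking pattern is the point: srcu_read_lock()
around the RCU list walk, plus mmget_not_zero() before taking mmap_sem.

#include <linux/rculist.h>
#include <linux/sched/mm.h>
#include <linux/srcu.h>

#include "encl.h"

/* Hypothetical reader-side sketch, not the actual reclaimer. */
static void sgx_reclaim_walk_sketch(struct sgx_encl *encl)
{
	struct sgx_encl_mm *encl_mm;
	int idx;

	/* The SRCU read-side keeps encl_mm alive while release paths run. */
	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		/* Skip mms that are already exiting. */
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		down_read(&encl_mm->mm->mmap_sem);
		/* ... walk the mm's enclave vmas and age/zap PTEs ... */
		up_read(&encl_mm->mm->mmap_sem);

		mmput(encl_mm->mm);
	}

	srcu_read_unlock(&encl->srcu, idx);
}

Using SRCU rather than plain RCU is what lets the reader sleep, e.g. in
down_read() and mmput(), while synchronize_srcu() in sgx_release() and
sgx_mmu_notifier_release() still guarantees that an encl_mm removed from
mm_list is invisible to all readers before it is freed.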