@@ -44,7 +44,6 @@ static int sgx_open(struct inode *inode, struct file *file)
static int sgx_release(struct inode *inode, struct file *file)
{
struct sgx_encl *encl = file->private_data;
- struct sgx_encl_mm *encl_mm;
/*
* Drain the remaining mm_list entries. At this point the list contains
@@ -52,31 +51,7 @@ static int sgx_release(struct inode *inode, struct file *file)
* not exited yet. The processes, which have exited, are gone from the
* list by sgx_mmu_notifier_release().
*/
- for ( ; ; ) {
- spin_lock(&encl->mm_lock);
-
- if (list_empty(&encl->mm_list)) {
- encl_mm = NULL;
- } else {
- encl_mm = list_first_entry(&encl->mm_list,
- struct sgx_encl_mm, list);
- list_del_rcu(&encl_mm->list);
- }
-
- spin_unlock(&encl->mm_lock);
-
- /* The enclave is no longer mapped by any mm. */
- if (!encl_mm)
- break;
-
- synchronize_srcu(&encl->srcu);
- mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
- kfree(encl_mm);
-
- /* 'encl_mm' is gone, put encl_mm->encl reference: */
- kref_put(&encl->refcount, sgx_encl_release);
- }
-
+ sgx_encl_mm_drain(encl);
kref_put(&encl->refcount, sgx_encl_release);
return 0;
}
@@ -430,6 +430,9 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
if (unlikely(!encl))
return VM_FAULT_SIGBUS;
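+ /* The enclave was OOM-killed and its pages zapped; refuse new faults: */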
+ if (test_bit(SGX_ENCL_OOM, &encl->flags))
+ return VM_FAULT_SIGBUS;
+
/*
* The page_array keeps track of all enclave pages, whether they
* are swapped out or not. If there is no entry for this page and
@@ -628,7 +631,8 @@ static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
if (!encl)
return -EFAULT;
- if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
+ if (!test_bit(SGX_ENCL_DEBUG, &encl->flags) ||
+ test_bit(SGX_ENCL_OOM, &encl->flags))
return -EFAULT;
for (i = 0; i < len; i += cnt) {
@@ -753,6 +757,45 @@ void sgx_encl_release(struct kref *ref)
kfree(encl);
}
+/**
+ * sgx_encl_mm_drain() - drain all mm_list entries
+ * @encl: address of the sgx_encl to drain
+ *
+ * Used during OOM kill to empty the mm_list entries after the enclave's pages
+ * have been zapped, and by sgx_release() to drain the remaining mm_list
+ * entries when the enclave fd is closed. Each drained entry puts the enclave
+ * reference it held, so the caller's subsequent kref_put() can release the
+ * enclave via sgx_encl_release().
+ */
+void sgx_encl_mm_drain(struct sgx_encl *encl)
+{
+ struct sgx_encl_mm *encl_mm;
+
+ for ( ; ; ) {
+ spin_lock(&encl->mm_lock);
+
+ if (list_empty(&encl->mm_list)) {
+ encl_mm = NULL;
+ } else {
+ encl_mm = list_first_entry(&encl->mm_list,
+ struct sgx_encl_mm, list);
+ list_del_rcu(&encl_mm->list);
+ }
+
+ spin_unlock(&encl->mm_lock);
+
+ /* The enclave is no longer mapped by any mm. */
+ if (!encl_mm)
+ break;
+
+ synchronize_srcu(&encl->srcu);
+ mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+ kfree(encl_mm);
+
+ /* 'encl_mm' is gone, put encl_mm->encl reference: */
+ kref_put(&encl->refcount, sgx_encl_release);
+ }
+}
+
/*
* 'mm' is exiting and no longer needs mmu notifications.
*/
@@ -822,6 +865,9 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
struct sgx_encl_mm *encl_mm;
int ret;
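+ /* An OOM-killed enclave cannot gain new mm mappings: */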
+ if (test_bit(SGX_ENCL_OOM, &encl->flags))
+ return -ENOMEM;
+
/*
* Even though a single enclave may be mapped into an mm more than once,
* each 'mm' only appears once on encl->mm_list. This is guaranteed by
@@ -39,6 +39,7 @@ enum sgx_encl_flags {
SGX_ENCL_DEBUG = BIT(1),
SGX_ENCL_CREATED = BIT(2),
SGX_ENCL_INITIALIZED = BIT(3),
+ SGX_ENCL_OOM = BIT(4),
};
struct sgx_encl_mm {
@@ -125,5 +126,6 @@ struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
unsigned long addr);
struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
+void sgx_encl_mm_drain(struct sgx_encl *encl);
#endif /* _X86_ENCL_H */
@@ -420,6 +420,9 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
return -EINVAL;
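+ /* No pages can be added to an enclave that has been OOM-killed: */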
+ if (test_bit(SGX_ENCL_OOM, &encl->flags))
+ return -ENOMEM;
+
if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
return -EFAULT;
@@ -605,6 +608,9 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
return -EINVAL;
+ if (test_bit(SGX_ENCL_OOM, &encl->flags))
+ return -ENOMEM;
+
if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
return -EFAULT;
@@ -681,6 +687,9 @@ static int sgx_ioc_sgx2_ready(struct sgx_encl *encl)
if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
return -EINVAL;
+ if (test_bit(SGX_ENCL_OOM, &encl->flags))
+ return -ENOMEM;
+
return 0;
}
@@ -662,6 +662,146 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
atomic_long_inc(&sgx_nr_free_pages);
}
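+/*
+ * Take a reference on the enclave that owns @epc_page so that it cannot be
+ * released while OOM handling is in progress. Returns false if the page has
+ * no enclave owner or if the enclave's refcount has already dropped to zero.
+ */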
+static bool sgx_oom_get_ref(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl *encl;
+
+ if (epc_page->flags & SGX_EPC_OWNER_ENCL_PAGE)
+ encl = epc_page->encl_page->encl;
+ else if (epc_page->flags & SGX_EPC_OWNER_ENCL)
+ encl = epc_page->encl;
+ else
+ return false;
+
+ return kref_get_unless_zero(&encl->refcount);
+}
+
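+/*
+ * Pick an OOM victim from the unreclaimable list: pop pages off the list
+ * until one is found whose owning enclave can still be pinned with a
+ * reference. Called with the LRU lock held.
+ */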
+static struct sgx_epc_page *sgx_oom_get_victim(struct sgx_epc_lru_lists *lru)
+{
+ struct sgx_epc_page *epc_page, *tmp;
+
+ if (list_empty(&lru->unreclaimable))
+ return NULL;
+
+ list_for_each_entry_safe(epc_page, tmp, &lru->unreclaimable, list) {
+ list_del_init(&epc_page->list);
+
+ if (sgx_oom_get_ref(epc_page))
+ return epc_page;
+ }
+ return NULL;
+}
+
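+/*
+ * Zap the PTEs of every VMA in [start, end) of @mm that belongs to the
+ * enclave identified by @owner and uses @ops as its vm_ops.
+ */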
+static void sgx_epc_oom_zap(void *owner, struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct vm_operations_struct *ops)
+{
+ VMA_ITERATOR(vmi, mm, start);
+ struct vm_area_struct *vma;
+
+ /*
+ * Use 'end' because 'start' can be zero and still not be mapped
+ * into the enclave, even if encl->base == 0.
+ */
+ for_each_vma_range(vmi, vma, end) {
+ if (vma->vm_ops == ops && vma->vm_private_data == owner &&
+ vma->vm_start < end) {
+ zap_vma_pages(vma);
+ }
+ }
+}
+
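+/*
+ * OOM-kill an enclave: mark it with SGX_ENCL_OOM, zap its PTEs in every mm
+ * that maps it and drain its mm_list. The enclave reference taken by
+ * sgx_oom_get_ref() is dropped before returning.
+ */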
+static bool sgx_oom_encl(struct sgx_encl *encl)
+{
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ bool ret = false;
+ int idx;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags))
+ goto out_put;
+
+ /*
+ * OOM was already performed on this enclave, do not redo it. This may
+ * happen when the SECS page is still UNRECLAIMABLE because another
+ * page is in RECLAIM_IN_PROGRESS. Still return true so the OOM killer
+ * waits until the reclaimer is done with the held-up page and the SECS
+ * before it moves on to find another victim.
+ */
+ if (test_bit(SGX_ENCL_OOM, &encl->flags))
+ goto out;
+
+ set_bit(SGX_ENCL_OOM, &encl->flags);
+
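+ /*
+ * Zap the enclave's pages in every mm that currently maps it. Retry
+ * if the mm_list changed while the zapping was in progress.
+ */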
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_rmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ sgx_epc_oom_zap(encl, encl_mm->mm, encl->base,
+ encl->base + encl->size, &sgx_vm_ops);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (WARN_ON_ONCE(encl->mm_list_version != mm_list_version));
+
+ sgx_encl_mm_drain(encl);
+out:
+ ret = true;
+
+out_put:
+ /*
+ * This puts the refcount we took when we identified this enclave as
+ * an OOM victim.
+ */
+ kref_put(&encl->refcount, sgx_encl_release);
+ return ret;
+}
+
+static inline bool sgx_oom_encl_page(struct sgx_encl_page *encl_page)
+{
+ return sgx_oom_encl(encl_page->encl);
+}
+
+/**
+ * sgx_epc_oom() - invoke EPC out-of-memory handling on target LRU
+ * @lru: LRU that is low on EPC pages
+ *
+ * Return: %true if a victim was found and OOM-killed.
+ */
+bool sgx_epc_oom(struct sgx_epc_lru_lists *lru)
+{
+ struct sgx_epc_page *victim;
+
+ spin_lock(&lru->lock);
+ victim = sgx_oom_get_victim(lru);
+ spin_unlock(&lru->lock);
+
+ if (!victim)
+ return false;
+
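+ /* sgx_oom_encl() drops the enclave reference taken in sgx_oom_get_victim(): */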
+ if (victim->flags & SGX_EPC_OWNER_ENCL_PAGE)
+ return sgx_oom_encl_page(victim->encl_page);
+
+ if (victim->flags & SGX_EPC_OWNER_ENCL)
+ return sgx_oom_encl(victim->encl);
+
+ /* Cannot happen unless more owner types are added in the future. */
+ WARN_ON_ONCE(1);
+ return false;
+}
+
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
unsigned long index,
struct sgx_epc_section *section)
@@ -178,6 +178,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
size_t sgx_reclaim_epc_pages(size_t nr_to_scan, bool ignore_age);
void sgx_isolate_epc_pages(struct sgx_epc_lru_lists *lrus, size_t nr_to_scan,
struct list_head *dst);
+bool sgx_epc_oom(struct sgx_epc_lru_lists *lru);
void sgx_ipi_cb(void *info);