@@ -5,6 +5,38 @@
#include <linux/kernel.h>
#include "epc_cgroup.h"
+#define SGX_EPC_RECLAIM_MIN_PAGES 16U
+
+static struct workqueue_struct *sgx_epc_cg_wq;
+
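+/*
+ * The misc cgroup charges EPC in bytes (each page is charged PAGE_SIZE), so
+ * convert the byte-based usage and limit counters to page counts.
+ */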
+static inline u64 sgx_epc_cgroup_page_counter_read(struct sgx_epc_cgroup *epc_cg)
+{
+ return atomic64_read(&epc_cg->cg->res[MISC_CG_RES_SGX_EPC].usage) / PAGE_SIZE;
+}
+
+static inline u64 sgx_epc_cgroup_max_pages(struct sgx_epc_cgroup *epc_cg)
+{
+ return READ_ONCE(epc_cg->cg->res[MISC_CG_RES_SGX_EPC].max) / PAGE_SIZE;
+}
+
+/*
+ * Get the lower bound of the limits of a cgroup and its ancestors. Used in
+ * sgx_epc_cgroup_reclaim_work_func() to determine whether the EPC usage of a
+ * cgroup is over its own limit or any of its ancestors', and hence whether
+ * reclamation is needed.
+ */
+static inline u64 sgx_epc_cgroup_max_pages_to_root(struct sgx_epc_cgroup *epc_cg)
+{
+ struct misc_cg *i = epc_cg->cg;
+ u64 m = U64_MAX;
+
+ while (i) {
+ m = min(m, READ_ONCE(i->res[MISC_CG_RES_SGX_EPC].max));
+ i = misc_cg_parent(i);
+ }
+
+ return m / PAGE_SIZE;
+}
+
static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct misc_cg *cg)
{
return (struct sgx_epc_cgroup *)(cg->res[MISC_CG_RES_SGX_EPC].priv);
@@ -15,12 +47,188 @@ static inline bool sgx_epc_cgroup_disabled(void)
return !cgroup_subsys_enabled(misc_cgrp_subsys);
}
+/**
+ * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
+ * @root: root of the tree to check
+ *
+ * Return: %true if all cgroups under the specified root have empty LRU lists.
+ * Used to avoid livelocks due to a cgroup having a non-zero charge count but
+ * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
+ * because all pages in the cgroup are unreclaimable.
+ */
+bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
+{
+ struct cgroup_subsys_state *css_root;
+ struct cgroup_subsys_state *pos;
+ struct sgx_epc_cgroup *epc_cg;
+ bool ret = true;
+
+ /*
+ * The caller must ensure that a reference to css_root is held.
+ */
+ css_root = &root->css;
+
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, css_root) {
+ if (!css_tryget(pos))
+ break;
+
+ rcu_read_unlock();
+
+ epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
+
+ spin_lock(&epc_cg->lru.lock);
+ ret = list_empty(&epc_cg->lru.reclaimable);
+ spin_unlock(&epc_cg->lru.lock);
+
+ rcu_read_lock();
+ css_put(pos);
+ if (!ret)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * sgx_epc_cgroup_isolate_pages() - walk a cgroup tree and scan LRUs to select pages for
+ * reclamation
+ * @root: root of the tree to start walking
+ * @nr_to_scan: The number of pages to scan
+ * @dst: Destination list to hold the isolated pages
+ */
+void sgx_epc_cgroup_isolate_pages(struct misc_cg *root,
+ unsigned int nr_to_scan, struct list_head *dst)
+{
+ struct cgroup_subsys_state *css_root;
+ struct cgroup_subsys_state *pos;
+ struct sgx_epc_cgroup *epc_cg;
+
+ if (!nr_to_scan)
+ return;
+
+ /* The caller must ensure that a reference to css_root is held. */
+ css_root = &root->css;
+
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, css_root) {
+ if (!css_tryget(pos))
+ break;
+ rcu_read_unlock();
+
+ epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
+ nr_to_scan = sgx_isolate_epc_pages(&epc_cg->lru, nr_to_scan, dst);
+
+ rcu_read_lock();
+ css_put(pos);
+ if (!nr_to_scan)
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+static unsigned int sgx_epc_cgroup_reclaim_pages(unsigned int nr_pages,
+ struct misc_cg *root)
+{
+ LIST_HEAD(iso);
+ /*
+ * Attempting to reclaim only a few pages will often fail and is inefficient, while
+ * reclaiming a huge number of pages can result in soft lockups due to holding various
+ * locks for an extended duration.
+ */
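+ /* E.g. a request to reclaim a single page is rounded up to SGX_EPC_RECLAIM_MIN_PAGES. */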
+ nr_pages = max(nr_pages, SGX_EPC_RECLAIM_MIN_PAGES);
+ nr_pages = min(nr_pages, SGX_NR_TO_SCAN_MAX);
+ sgx_epc_cgroup_isolate_pages(root, nr_pages, &iso);
+
+ return sgx_do_epc_reclamation(&iso);
+}
+
+/*
+ * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the cgroup
+ * when the cgroup is at or near its maximum capacity.
+ */
+static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
+{
+ struct sgx_epc_cgroup *epc_cg;
+ u64 cur, max;
+
+ epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
+
+ for (;;) {
+ max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
+
+ /*
+ * Adjust the limit down by one page; the goal is to free up
+ * pages for fault allocations, not to simply obey the limit.
+ * Conditionally decrementing max also means the cur vs. max
+ * check will correctly handle the case where both are zero.
+ */
+ if (max)
+ max--;
+
+ /*
+ * Unless the limit is extremely low, in which case forcing
+ * reclaim will likely cause thrashing, force the cgroup to
+ * reclaim at least once if it's operating *near* its maximum
+ * limit by adjusting @max down by half the min reclaim size.
+ * This work func is scheduled by sgx_epc_cgroup_try_charge
+ * when it cannot directly reclaim due to being in an atomic
+ * context, e.g. EPC allocation in a fault handler. Waiting
+ * to reclaim until the cgroup is actually at its limit is less
+ * performant as it means the faulting task is effectively
+ * blocked until a worker makes its way through the global work
+ * queue.
+ */
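+ /* "Near" here means within SGX_EPC_RECLAIM_MIN_PAGES / 2 (i.e. 8) pages of the limit. */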
+ if (max > SGX_NR_TO_SCAN_MAX)
+ max -= (SGX_EPC_RECLAIM_MIN_PAGES / 2);
+
+ cur = sgx_epc_cgroup_page_counter_read(epc_cg);
+
+ if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
+ break;
+
+ /* Keep reclaiming until the above condition is met. */
+ sgx_epc_cgroup_reclaim_pages((unsigned int)(cur - max), epc_cg->cg);
+ }
+}
+
+static int __sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg,
+ bool reclaim)
+{
+ for (;;) {
+ if (!misc_cg_try_charge(MISC_CG_RES_SGX_EPC, epc_cg->cg,
+ PAGE_SIZE))
+ break;
+
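+ /*
+ * The charge failed, i.e. this cgroup or one of its ancestors is at
+ * its limit. If nothing in the subtree is left on an LRU, there is
+ * nothing to reclaim and the allocation must fail.
+ */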
+ if (sgx_epc_cgroup_lru_empty(epc_cg->cg))
+ return -ENOMEM;
+
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
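+ /*
+ * Synchronous reclaim is not allowed, e.g. the allocation is being
+ * made from an atomic context such as a fault handler. Defer the
+ * reclamation to the workqueue and let the caller back off and retry.
+ */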
+ if (!reclaim) {
+ queue_work(sgx_epc_cg_wq, &epc_cg->reclaim_work);
+ return -EBUSY;
+ }
+
+ if (!sgx_epc_cgroup_reclaim_pages(1, epc_cg->cg))
+ /* All pages were too young to reclaim, try again */
+ schedule();
+ }
+
+ return 0;
+}
+
/**
* sgx_epc_cgroup_try_charge() - hierarchically try to charge a single EPC page
+ * @reclaim: whether or not synchronous reclaim is allowed
*
* Returns EPC cgroup or NULL on success, -errno on failure.
*/
-struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(void)
+struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(bool reclaim)
{
struct sgx_epc_cgroup *epc_cg;
int ret;
@@ -29,12 +237,12 @@ struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(void)
return NULL;
epc_cg = sgx_epc_cgroup_from_misc_cg(get_current_misc_cg());
- ret = misc_cg_try_charge(MISC_CG_RES_SGX_EPC, epc_cg->cg, PAGE_SIZE);
+ ret = __sgx_epc_cgroup_try_charge(epc_cg, reclaim);
if (ret) {
/* No epc_cg returned, release ref from get_current_misc_cg() */
put_misc_cg(epc_cg->cg);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
/* Ref released in sgx_epc_cgroup_uncharge() */
@@ -64,6 +272,7 @@ static void sgx_epc_cgroup_free(struct misc_cg *cg)
if (!epc_cg)
return;
+ cancel_work_sync(&epc_cg->reclaim_work);
kfree(epc_cg);
}
@@ -82,6 +291,8 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
if (!epc_cg)
return -ENOMEM;
+ sgx_lru_init(&epc_cg->lru);
+ INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
cg->res[MISC_CG_RES_SGX_EPC].misc_ops = &sgx_epc_cgroup_ops;
cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
epc_cg->cg = cg;
@@ -95,6 +306,11 @@ static int __init sgx_epc_cgroup_init(void)
if (!boot_cpu_has(X86_FEATURE_SGX))
return 0;
+ sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
+ WQ_UNBOUND | WQ_FREEZABLE,
+ WQ_UNBOUND_MAX_ACTIVE);
+ BUG_ON(!sgx_epc_cg_wq);
+
cg = misc_cg_root();
BUG_ON(!cg);
@@ -16,20 +16,33 @@
#define MISC_CG_RES_SGX_EPC MISC_CG_RES_TYPES
struct sgx_epc_cgroup;
-static inline struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(void)
+static inline struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(bool reclaim)
{
return NULL;
}
static inline void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg) { }
+
+static inline void sgx_epc_cgroup_isolate_pages(struct misc_cg *root,
+ unsigned int nr_to_scan,
+ struct list_head *dst) { }
+
+static inline bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
+{
+ return true;
+}
#else
struct sgx_epc_cgroup {
- struct misc_cg *cg;
+ struct misc_cg *cg;
+ struct sgx_epc_lru_list lru;
+ struct work_struct reclaim_work;
};
-struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(void);
+struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(bool reclaim);
void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg);
bool sgx_epc_cgroup_lru_empty(struct misc_cg *root);
+void sgx_epc_cgroup_isolate_pages(struct misc_cg *root, unsigned int nr_to_scan,
+ struct list_head *dst);
#endif
@@ -32,6 +32,31 @@ static DEFINE_XARRAY(sgx_epc_address_space);
*/
static struct sgx_epc_lru_list sgx_global_lru;
+#ifndef CONFIG_CGROUP_SGX_EPC
+static inline struct sgx_epc_lru_list *sgx_lru_list(struct sgx_epc_page *epc_page)
+{
+ return &sgx_global_lru;
+}
+#else
+static inline struct sgx_epc_lru_list *sgx_lru_list(struct sgx_epc_page *epc_page)
+{
+ if (epc_page->epc_cg)
+ return &epc_page->epc_cg->lru;
+
+ /* This should not happen if the kernel is configured correctly */
+ WARN_ON_ONCE(1);
+ return &sgx_global_lru;
+}
+#endif
+
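+/*
+ * When EPC cgroups are enabled, reclaimable pages live on the per-cgroup LRUs,
+ * so the whole hierarchy must be checked; otherwise only the global LRU can
+ * hold reclaimable pages.
+ */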
+static inline bool sgx_can_reclaim(void)
+{
+ if (IS_ENABLED(CONFIG_CGROUP_SGX_EPC))
+ return !sgx_epc_cgroup_lru_empty(misc_cg_root());
+
+ return !list_empty(&sgx_global_lru.reclaimable);
+}
+
static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
/* Nodes with one or more EPC sections. */
@@ -342,6 +367,7 @@ unsigned int sgx_do_epc_reclamation(struct list_head *iso)
struct sgx_backing backing[SGX_NR_TO_SCAN_MAX];
struct sgx_epc_page *epc_page, *tmp;
struct sgx_encl_page *encl_page;
+ struct sgx_epc_lru_list *lru;
pgoff_t page_index;
size_t ret, i;
@@ -370,10 +396,11 @@ unsigned int sgx_do_epc_reclamation(struct list_head *iso)
continue;
skip:
- spin_lock(&sgx_global_lru.lock);
+ lru = sgx_lru_list(epc_page);
+ spin_lock(&lru->lock);
sgx_epc_page_set_state(epc_page, SGX_EPC_PAGE_RECLAIMABLE);
- list_move_tail(&epc_page->list, &sgx_global_lru.reclaimable);
- spin_unlock(&sgx_global_lru.lock);
+ list_move_tail(&epc_page->list, &lru->reclaimable);
+ spin_unlock(&lru->lock);
kref_put(&encl_page->encl->refcount, sgx_encl_release);
}
@@ -397,9 +424,13 @@ unsigned int sgx_do_epc_reclamation(struct list_head *iso)
static void sgx_reclaim_epc_pages_global(void)
{
+ unsigned int nr_to_scan = SGX_NR_TO_SCAN;
LIST_HEAD(iso);
- sgx_isolate_epc_pages(&sgx_global_lru, SGX_NR_TO_SCAN, &iso);
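+ /*
+ * With EPC cgroups enabled, global reclaim sweeps the LRUs of the whole
+ * hierarchy, starting from the root cgroup.
+ */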
+ if (IS_ENABLED(CONFIG_CGROUP_SGX_EPC))
+ sgx_epc_cgroup_isolate_pages(misc_cg_root(), nr_to_scan, &iso);
+ else
+ sgx_isolate_epc_pages(&sgx_global_lru, nr_to_scan, &iso);
sgx_do_epc_reclamation(&iso);
}
@@ -407,7 +438,7 @@ static void sgx_reclaim_epc_pages_global(void)
static bool sgx_should_reclaim(unsigned long watermark)
{
return atomic_long_read(&sgx_nr_free_pages) < watermark &&
- !list_empty(&sgx_global_lru.reclaimable);
+ sgx_can_reclaim();
}
/*
@@ -528,26 +559,26 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
}
/**
- * sgx_mark_page_reclaimable() - Mark a page as reclaimable
+ * sgx_mark_page_reclaimable() - Mark a page as reclaimable and add it to an appropriate LRU
* @page: EPC page
*
- * Mark a page as reclaimable and add it to the active page list. Pages
- * are automatically removed from the active list when freed.
*/
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
- spin_lock(&sgx_global_lru.lock);
+ struct sgx_epc_lru_list *lru = sgx_lru_list(page);
+
+ spin_lock(&lru->lock);
WARN_ON_ONCE(sgx_epc_page_reclaimable(page->flags));
page->flags |= SGX_EPC_PAGE_RECLAIMABLE;
- list_add_tail(&page->list, &sgx_global_lru.reclaimable);
- spin_unlock(&sgx_global_lru.lock);
+ list_add_tail(&page->list, &lru->reclaimable);
+ spin_unlock(&lru->lock);
}
/**
* sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
* @page: EPC page
*
- * Clear the reclaimable flag and remove the page from the active page list.
+ * Clear the reclaimable flag if set and remove the page from its LRU.
*
* Return:
* 0 on success,
@@ -555,15 +586,17 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
*/
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
- spin_lock(&sgx_global_lru.lock);
+ struct sgx_epc_lru_list *lru = sgx_lru_list(page);
+
+ spin_lock(&lru->lock);
if (sgx_epc_page_reclaim_in_progress(page->flags)) {
- spin_unlock(&sgx_global_lru.lock);
+ spin_unlock(&lru->lock);
return -EBUSY;
}
list_del(&page->list);
sgx_epc_page_reset_state(page);
- spin_unlock(&sgx_global_lru.lock);
+ spin_unlock(&lru->lock);
return 0;
}
@@ -590,7 +623,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
struct sgx_epc_page *page;
struct sgx_epc_cgroup *epc_cg;
- epc_cg = sgx_epc_cgroup_try_charge();
+ epc_cg = sgx_epc_cgroup_try_charge(reclaim);
if (IS_ERR(epc_cg))
return ERR_CAST(epc_cg);
@@ -601,8 +634,10 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
break;
}
- if (list_empty(&sgx_global_lru.reclaimable))
- return ERR_PTR(-ENOMEM);
+ if (!sgx_can_reclaim()) {
+ page = ERR_PTR(-ENOMEM);
+ break;
+ }
if (!reclaim) {
page = ERR_PTR(-EBUSY);