@@ -8,16 +8,158 @@
static struct sgx_cgroup sgx_cg_root;
/**
- * sgx_cgroup_try_charge() - try to charge cgroup for a single EPC page
+ * sgx_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
+ * @root: Root of the tree to check
*
+ * Return: %true if all cgroups under the specified root have empty LRU lists.
+ */
+static bool sgx_cgroup_lru_empty(struct misc_cg *root)
+{
+ struct cgroup_subsys_state *css_root;
+ struct cgroup_subsys_state *pos;
+ struct sgx_cgroup *sgx_cg;
+ bool ret = true;
+
+ /*
+ * Caller must ensure a reference to css_root is acquired.
+ */
+ css_root = &root->css;
+
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, css_root) {
+ if (!css_tryget(pos))
+ break;
+
+ rcu_read_unlock();
+
+ sgx_cg = sgx_cgroup_from_misc_cg(css_misc(pos));
+
+ spin_lock(&sgx_cg->lru.lock);
+ ret = list_empty(&sgx_cg->lru.reclaimable);
+ spin_unlock(&sgx_cg->lru.lock);
+
+ rcu_read_lock();
+ css_put(pos);
+ if (!ret)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/**
+ * sgx_cgroup_reclaim_pages() - reclaim EPC from a cgroup tree
+ * @root: The root of the cgroup tree to reclaim from.
+ * @start: The descendant cgroup from which to start the tree walk.
+ *
+ * This function performs a pre-order walk of the cgroup tree under the given
+ * root, starting from the node @start, or from the root if @start is NULL. At
+ * each node it attempts to reclaim pages until a fixed number of pages
+ * (%SGX_NR_TO_SCAN) has been attempted for reclamation. There is no guarantee
+ * that the attempted pages are actually reclaimed. In the extreme case, if all
+ * pages at the front of the LRUs were recently accessed, i.e., are considered
+ * "too young" to reclaim, no page will actually be reclaimed after walking the
+ * whole tree.
+ *
+ * In some cases, a caller may want to ensure enough reclamation happens to meet
+ * its specific need. In those cases, the caller should invoke this function in
+ * a loop, and at each iteration pass in the same root and the next node
+ * returned from the previous call as the new @start.
+ *
+ * Return: The next misc cgroup in the subtree from which to continue scanning
+ * and attempt further reclamation if needed. The caller must release this
+ * reference unless it is passed as @start to a subsequent call.
+ */
+static struct misc_cg *sgx_cgroup_reclaim_pages(struct misc_cg *root, struct misc_cg *start)
+{
+ struct cgroup_subsys_state *css_root, *pos;
+ struct cgroup_subsys_state *next = NULL;
+ struct sgx_cgroup *sgx_cg;
+ unsigned int cnt = 0;
+
+ /* Caller must ensure css_root and start refs are acquired */
+ css_root = &root->css;
+ if (start)
+ pos = &start->css;
+ else
+ pos = css_root;
+
+ while (cnt < SGX_NR_TO_SCAN) {
+ sgx_cg = sgx_cgroup_from_misc_cg(css_misc(pos));
+ cnt += sgx_reclaim_pages(&sgx_cg->lru);
+
+ rcu_read_lock();
+
+ next = css_next_descendant_pre(pos, css_root);
+
+ if (pos != css_root)
+ css_put(pos);
+
+ if (!next || !css_tryget(next)) {
+ /*
+ * We are done if next is NULL. If next is dead, it is not safe
+ * to continue the walk, so return NULL and let the caller decide
+ * whether to restart from the root.
+ */
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ rcu_read_unlock();
+ pos = next;
+ }
+
+ return css_misc(next);
+}
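
To make the walk protocol described in the kernel-doc above concrete, a caller
that needs to keep reclaiming until its own condition is met would look roughly
like the sketch below. This is illustrative only and not part of the patch:
sgx_cg_needs_more_epc() is a hypothetical placeholder for the caller's stop
condition, and the reference handling mirrors sgx_cgroup_try_charge() further
down.

/* Illustrative sketch; sgx_cg_needs_more_epc() is a hypothetical predicate. */
static void sgx_cgroup_reclaim_until_satisfied(struct misc_cg *root)
{
	struct misc_cg *next = NULL;

	while (sgx_cg_needs_more_epc()) {
		/* Resume the pre-order walk where the previous call left off. */
		next = sgx_cgroup_reclaim_pages(root, next);
		cond_resched();
	}

	/* The last returned node is not reused as @start, so drop its reference. */
	if (next != root)
		put_misc_cg(next);
}
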
+
+static int __sgx_cgroup_try_charge(struct sgx_cgroup *epc_cg)
+{
+ if (!misc_cg_try_charge(MISC_CG_RES_SGX_EPC, epc_cg->cg, PAGE_SIZE))
+ return 0;
+
+ /* No reclaimable pages left in this cgroup or its descendants */
+ if (sgx_cgroup_lru_empty(epc_cg->cg))
+ return -ENOMEM;
+
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ return -EBUSY;
+}
+
+/**
+ * sgx_cgroup_try_charge() - try to charge cgroup for a single EPC page
* @sgx_cg: The EPC cgroup to be charged for the page.
+ * @reclaim: Whether or not synchronous EPC reclaim is allowed.
* Return:
* * %0 - If successfully charged.
* * -errno - for failures.
*/
-int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg)
+int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg, enum sgx_reclaim reclaim)
{
- return misc_cg_try_charge(MISC_CG_RES_SGX_EPC, sgx_cg->cg, PAGE_SIZE);
+ int ret;
+ struct misc_cg *cg_next = NULL;
+
+ for (;;) {
+ ret = __sgx_cgroup_try_charge(sgx_cg);
+
+ if (ret != -EBUSY)
+ goto out;
+
+ if (reclaim == SGX_NO_RECLAIM) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cg_next = sgx_cgroup_reclaim_pages(sgx_cg->cg, cg_next);
+ cond_resched();
+ }
+
+out:
+ if (cg_next != sgx_cg->cg)
+ put_misc_cg(cg_next);
+ return ret;
}
/**
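
For context, the body of sgx_cgroup_uncharge() (declared in the header hunk
below) is not part of this excerpt. A minimal sketch of what it is assumed to
do, namely return the PAGE_SIZE charge taken in sgx_cgroup_try_charge() to the
misc controller:

/* Assumed behavior; the actual body is not shown in this excerpt. */
void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg)
{
	misc_cg_uncharge(MISC_CG_RES_SGX_EPC, sgx_cg->cg, PAGE_SIZE);
}
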
@@ -42,6 +184,7 @@ static void sgx_cgroup_free(struct misc_cg *cg)
static void sgx_cgroup_misc_init(struct misc_cg *cg, struct sgx_cgroup *sgx_cg)
{
+ sgx_lru_init(&sgx_cg->lru);
cg->res[MISC_CG_RES_SGX_EPC].priv = sgx_cg;
sgx_cg->cg = cg;
}
@@ -20,7 +20,7 @@ static inline struct sgx_cgroup *sgx_get_current_cg(void)
static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg) { }
-static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg)
+static inline int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg, enum sgx_reclaim reclaim)
{
return 0;
}
@@ -33,6 +33,7 @@ static inline void sgx_cgroup_init(void) { }
struct sgx_cgroup {
struct misc_cg *cg;
+ struct sgx_epc_lru_list lru;
};
static inline struct sgx_cgroup *sgx_cgroup_from_misc_cg(struct misc_cg *cg)
@@ -63,7 +64,7 @@ static inline void sgx_put_cg(struct sgx_cgroup *sgx_cg)
put_misc_cg(sgx_cg->cg);
}
-int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg);
+int sgx_cgroup_try_charge(struct sgx_cgroup *sgx_cg, enum sgx_reclaim reclaim);
void sgx_cgroup_uncharge(struct sgx_cgroup *sgx_cg);
void sgx_cgroup_init(void);
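
The lru field added to struct sgx_cgroup above reuses struct sgx_epc_lru_list,
whose definition is not shown in this excerpt. Based on how the patch uses it
(lru.lock, lru.reclaimable, sgx_lru_init()), the assumed layout is roughly:

/* Assumed layout and init helper, inferred from usage in this patch. */
struct sgx_epc_lru_list {
	spinlock_t lock;
	struct list_head reclaimable;
};

static inline void sgx_lru_init(struct sgx_epc_lru_list *lru)
{
	spin_lock_init(&lru->lock);
	INIT_LIST_HEAD(&lru->reclaimable);
}
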
@@ -286,11 +286,14 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
mutex_unlock(&encl->lock);
}
-/*
- * Take a fixed number of pages from the head of the active page pool and
- * reclaim them to the enclave's private shmem files. Skip the pages, which have
- * been accessed since the last scan. Move those pages to the tail of active
- * page pool so that the pages get scanned in LRU like fashion.
+/**
+ * sgx_reclaim_pages() - Attempt to reclaim a fixed number of pages from an LRU
+ * @lru: The LRU from which pages are reclaimed.
+ *
+ * Take a fixed number of pages from the head of a given LRU and reclaim them
+ * to the enclave's private shmem files. Skip pages that have been accessed
+ * since the last scan. Move those pages to the tail of the list so that they
+ * get scanned in an LRU-like fashion.
*
* Batch process a chunk of pages (at the moment 16) in order to degrade amount
* of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit
@@ -298,8 +301,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
* + EWB) but not sufficiently. Reclaiming one page at a time would also be
* problematic as it would increase the lock contention too much, which would
* halt forward progress.
+ *
+ * Return: Number of pages attempted for reclamation.
*/
-static void sgx_reclaim_pages(void)
+unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru)
{
struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
struct sgx_backing backing[SGX_NR_TO_SCAN];
@@ -310,10 +315,9 @@ static void sgx_reclaim_pages(void)
int ret;
int i;
- spin_lock(&sgx_global_lru.lock);
+ spin_lock(&lru->lock);
for (i = 0; i < SGX_NR_TO_SCAN; i++) {
- epc_page = list_first_entry_or_null(&sgx_global_lru.reclaimable,
- struct sgx_epc_page, list);
+ epc_page = list_first_entry_or_null(&lru->reclaimable, struct sgx_epc_page, list);
if (!epc_page)
break;
@@ -328,7 +332,7 @@ static void sgx_reclaim_pages(void)
*/
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
}
- spin_unlock(&sgx_global_lru.lock);
+ spin_unlock(&lru->lock);
for (i = 0; i < cnt; i++) {
epc_page = chunk[i];
@@ -351,9 +355,9 @@ static void sgx_reclaim_pages(void)
continue;
skip:
- spin_lock(&sgx_global_lru.lock);
- list_add_tail(&epc_page->list, &sgx_global_lru.reclaimable);
- spin_unlock(&sgx_global_lru.lock);
+ spin_lock(&lru->lock);
+ list_add_tail(&epc_page->list, &lru->reclaimable);
+ spin_unlock(&lru->lock);
kref_put(&encl_page->encl->refcount, sgx_encl_release);
@@ -379,14 +383,21 @@ static void sgx_reclaim_pages(void)
sgx_free_epc_page(epc_page);
}
+
+ return cnt;
}
-static bool sgx_should_reclaim(unsigned long watermark)
+static bool sgx_should_reclaim_global(unsigned long watermark)
{
return atomic_long_read(&sgx_nr_free_pages) < watermark &&
!list_empty(&sgx_global_lru.reclaimable);
}
+static void sgx_reclaim_pages_global(void)
+{
+ sgx_reclaim_pages(&sgx_global_lru);
+}
+
/*
* sgx_reclaim_direct() should be called (without enclave's mutex held)
* in locations where SGX memory resources might be low and might be
@@ -394,8 +405,8 @@ static bool sgx_should_reclaim(unsigned long watermark)
*/
void sgx_reclaim_direct(void)
{
- if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
- sgx_reclaim_pages();
+ if (sgx_should_reclaim_global(SGX_NR_LOW_PAGES))
+ sgx_reclaim_pages_global();
}
static int ksgxd(void *p)
@@ -415,10 +426,10 @@ static int ksgxd(void *p)
wait_event_freezable(ksgxd_waitq,
kthread_should_stop() ||
- sgx_should_reclaim(SGX_NR_HIGH_PAGES));
+ sgx_should_reclaim_global(SGX_NR_HIGH_PAGES));
- if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
- sgx_reclaim_pages();
+ if (sgx_should_reclaim_global(SGX_NR_HIGH_PAGES))
+ sgx_reclaim_pages_global();
cond_resched();
}
@@ -572,7 +583,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
int ret;
sgx_cg = sgx_get_current_cg();
- ret = sgx_cgroup_try_charge(sgx_cg);
+ ret = sgx_cgroup_try_charge(sgx_cg, reclaim);
if (ret) {
sgx_put_cg(sgx_cg);
return ERR_PTR(ret);
@@ -600,7 +611,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
break;
}
- sgx_reclaim_pages();
+ sgx_reclaim_pages_global();
cond_resched();
}
@@ -613,7 +624,7 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim)
sgx_put_cg(sgx_cg);
}
- if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ if (sgx_should_reclaim_global(SGX_NR_LOW_PAGES))
wake_up(&ksgxd_waitq);
return page;
@@ -135,6 +135,7 @@ void sgx_reclaim_direct(void);
void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, enum sgx_reclaim reclaim);
+unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru);
void sgx_ipi_cb(void *info);