| Message ID | 20240130020938.10025-9-haitao.huang@linux.intel.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | Add Cgroup support for SGX EPC memory |
On Tue Jan 30, 2024 at 4:09 AM EET, Haitao Huang wrote:
> From: Kristen Carlson Accardi <kristen@linux.intel.com>
>
> Implement the reclamation flow for cgroups, encapsulated in the top-level
> function sgx_epc_cgroup_reclaim_pages(). It does a pre-order walk on its
> subtree and makes calls to sgx_reclaim_pages() at each node, passing in
> the LRU of that node. It keeps track of the total pages reclaimed and the
> pages left to attempt, and stops the walk once the desired number of pages
> has been attempted.
>
> In some contexts, e.g. page fault handling, only asynchronous reclamation
> is allowed. Create a workqueue, a corresponding work item and function
> definitions to support the asynchronous reclamation. Both the synchronous
> and asynchronous flows invoke the same top-level reclaim function, and
> will be triggered later by sgx_epc_cgroup_try_charge() when usage of the
> cgroup is at or near its limit.
>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
> Co-developed-by: Haitao Huang <haitao.huang@linux.intel.com>
> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> ---
> V8:
> - Remove alignment for substructure variables. (Jarkko)
>
> V7:
> - Split this out from the big patch, #10 in V6. (Dave, Kai)
> ---
>  arch/x86/kernel/cpu/sgx/epc_cgroup.c | 174 ++++++++++++++++++++++++++-
>  arch/x86/kernel/cpu/sgx/epc_cgroup.h |   3 +
>  2 files changed, 176 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> index eac8548164de..8858a0850f8a 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> @@ -7,9 +7,173 @@
>
>  static struct sgx_epc_cgroup epc_cg_root;
>
> +static struct workqueue_struct *sgx_epc_cg_wq;

I'd document this with a comment. Missed this for "epc_cg_root", which
should have the same treatment. Just a brief 1-2 line thing, no need for
prose here. Something to remind what it is later on.

> +
> +static inline u64 sgx_epc_cgroup_page_counter_read(struct sgx_epc_cgroup *epc_cg)
> +{
> +        return atomic64_read(&epc_cg->cg->res[MISC_CG_RES_SGX_EPC].usage) / PAGE_SIZE;
> +}
> +
> +static inline u64 sgx_epc_cgroup_max_pages(struct sgx_epc_cgroup *epc_cg)
> +{
> +        return READ_ONCE(epc_cg->cg->res[MISC_CG_RES_SGX_EPC].max) / PAGE_SIZE;
> +}
> +
> +/*
> + * Get the lower bound of limits of a cgroup and its ancestors. Used in
> + * sgx_epc_cgroup_reclaim_work_func() to determine if EPC usage of a cgroup is
> + * over its limit or its ancestors' hence reclamation is needed.
> + */
> +static inline u64 sgx_epc_cgroup_max_pages_to_root(struct sgx_epc_cgroup *epc_cg)
> +{
> +        struct misc_cg *i = epc_cg->cg;
> +        u64 m = U64_MAX;
> +
> +        while (i) {
> +                m = min(m, READ_ONCE(i->res[MISC_CG_RES_SGX_EPC].max));
> +                i = misc_cg_parent(i);
> +        }
> +
> +        return m / PAGE_SIZE;
> +}
> +
>  /**
> - * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
> + * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
> + * @root: Root of the tree to check
>   *
> + * Return: %true if all cgroups under the specified root have empty LRU lists.
> + * Used to avoid livelocks due to a cgroup having a non-zero charge count but
> + * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
> + * because all pages in the cgroup are unreclaimable.
> + */
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
> +{
> +        struct cgroup_subsys_state *css_root;
> +        struct cgroup_subsys_state *pos;
> +        struct sgx_epc_cgroup *epc_cg;
> +        bool ret = true;
> +
> +        /*
> +         * Caller ensure css_root ref acquired
> +         */
> +        css_root = &root->css;
> +
> +        rcu_read_lock();
> +        css_for_each_descendant_pre(pos, css_root) {
> +                if (!css_tryget(pos))
> +                        break;
> +
> +                rcu_read_unlock();
> +
> +                epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +
> +                spin_lock(&epc_cg->lru.lock);
> +                ret = list_empty(&epc_cg->lru.reclaimable);
> +                spin_unlock(&epc_cg->lru.lock);
> +
> +                rcu_read_lock();
> +                css_put(pos);
> +                if (!ret)
> +                        break;
> +        }
> +
> +        rcu_read_unlock();
> +
> +        return ret;
> +}
> +
> +/**
> + * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
> + * @root: Root of the tree to start walking
> + * Return: Number of pages reclaimed.
> + */
> +unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> +{
> +        /*
> +         * Attempting to reclaim only a few pages will often fail and is
> +         * inefficient, while reclaiming a huge number of pages can result in
> +         * soft lockups due to holding various locks for an extended duration.
> +         */
> +        unsigned int nr_to_scan = SGX_NR_TO_SCAN;
> +        struct cgroup_subsys_state *css_root;
> +        struct cgroup_subsys_state *pos;
> +        struct sgx_epc_cgroup *epc_cg;
> +        unsigned int cnt;
> +
> +        /* Caller ensure css_root ref acquired */
> +        css_root = &root->css;
> +
> +        cnt = 0;
> +        rcu_read_lock();
> +        css_for_each_descendant_pre(pos, css_root) {
> +                if (!css_tryget(pos))
> +                        break;
> +                rcu_read_unlock();
> +
> +                epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +                cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
> +
> +                rcu_read_lock();
> +                css_put(pos);
> +                if (!nr_to_scan)
> +                        break;
> +        }
> +
> +        rcu_read_unlock();
> +        return cnt;
> +}
> +
> +/*
> + * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the cgroup
> + * when the cgroup is at/near its maximum capacity
> + */
> +static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
> +{
> +        struct sgx_epc_cgroup *epc_cg;
> +        u64 cur, max;
> +
> +        epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
> +
> +        for (;;) {
> +                max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
> +
> +                /*
> +                 * Adjust the limit down by one page, the goal is to free up
> +                 * pages for fault allocations, not to simply obey the limit.
> +                 * Conditionally decrementing max also means the cur vs. max
> +                 * check will correctly handle the case where both are zero.
> +                 */
> +                if (max)
> +                        max--;
> +
> +                /*
> +                 * Unless the limit is extremely low, in which case forcing
> +                 * reclaim will likely cause thrashing, force the cgroup to
> +                 * reclaim at least once if it's operating *near* its maximum
> +                 * limit by adjusting @max down by half the min reclaim size.
> +                 * This work func is scheduled by sgx_epc_cgroup_try_charge
> +                 * when it cannot directly reclaim due to being in an atomic
> +                 * context, e.g. EPC allocation in a fault handler. Waiting
> +                 * to reclaim until the cgroup is actually at its limit is less
> +                 * performant as it means the faulting task is effectively
> +                 * blocked until a worker makes its way through the global work
> +                 * queue.
> +                 */
> +                if (max > SGX_NR_TO_SCAN * 2)
> +                        max -= (SGX_NR_TO_SCAN / 2);
> +
> +                cur = sgx_epc_cgroup_page_counter_read(epc_cg);
> +
> +                if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
> +                        break;
> +
> +                /* Keep reclaiming until above condition is met. */
> +                sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
> +        }
> +}
> +
> +/**
> + * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
>   * @epc_cg: The EPC cgroup to be charged for the page.
>   * Return:
>   * * %0 - If successfully charged.
> @@ -37,6 +201,7 @@ static void sgx_epc_cgroup_free(struct misc_cg *cg)
>          if (!epc_cg)
>                  return;
>
> +        cancel_work_sync(&epc_cg->reclaim_work);
>          kfree(epc_cg);
>  }
>
> @@ -49,6 +214,8 @@ const struct misc_res_ops sgx_epc_cgroup_ops = {
>
>  static void sgx_epc_misc_init(struct misc_cg *cg, struct sgx_epc_cgroup *epc_cg)
>  {
> +        sgx_lru_init(&epc_cg->lru);
> +        INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
>          cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
>          epc_cg->cg = cg;
>  }
> @@ -68,6 +235,11 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
>
>  void sgx_epc_cgroup_init(void)
>  {
> +        sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
> +                                        WQ_UNBOUND | WQ_FREEZABLE,
> +                                        WQ_UNBOUND_MAX_ACTIVE);
> +        BUG_ON(!sgx_epc_cg_wq);
> +
>          misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_epc_cgroup_ops);
>          sgx_epc_misc_init(misc_cg_root(), &epc_cg_root);
>  }
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> index 6b664b4c321f..e3c6a08f0ee8 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> @@ -34,6 +34,8 @@ static inline void sgx_epc_cgroup_init(void) { }
>  #else
>  struct sgx_epc_cgroup {
>          struct misc_cg *cg;
> +        struct sgx_epc_lru_list lru;
> +        struct work_struct reclaim_work;
>  };
>
>  static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct misc_cg *cg)
> @@ -66,6 +68,7 @@ static inline void sgx_put_epc_cg(struct sgx_epc_cgroup *epc_cg)
>
>  int sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg);
>  void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg);
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root);
>  void sgx_epc_cgroup_init(void);
>
>  #endif

BR, Jarkko
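Jarkko's comment above asks for brief one- or two-line comments on epc_cg_root and sgx_epc_cg_wq. A minimal sketch of what such comments could look like; the wording is only illustrative and is not taken from any version of the series:

/* EPC cgroup bound to the misc cgroup root; accounts usage of all EPC pages. */
static struct sgx_epc_cgroup epc_cg_root;

/* Workqueue that runs the per-cgroup asynchronous EPC reclamation work items. */
static struct workqueue_struct *sgx_epc_cg_wq;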
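The commit message says both reclaim flows will later be triggered by sgx_epc_cgroup_try_charge() when a cgroup is at or near its limit, but that wiring is not part of this patch. Purely as an illustration of the intended flow (this is not the sgx_epc_cgroup_try_charge() from the series, and the reclaim_allowed parameter is a hypothetical stand-in for however the series distinguishes atomic contexts), the charge path could look roughly like:

/*
 * Illustrative sketch only: charge one EPC page, falling back to the
 * synchronous reclaim walk when allowed, or kicking the per-cgroup work
 * item when only asynchronous reclamation is possible.
 */
static int sgx_epc_cgroup_try_charge_sketch(struct sgx_epc_cgroup *epc_cg,
                                            bool reclaim_allowed)
{
        for (;;) {
                /* misc_cg_try_charge() returns 0 when the charge fits the limit. */
                if (!misc_cg_try_charge(MISC_CG_RES_SGX_EPC, epc_cg->cg,
                                        PAGE_SIZE))
                        return 0;

                /* Over the limit and in atomic context: defer to the workqueue. */
                if (!reclaim_allowed) {
                        queue_work(sgx_epc_cg_wq, &epc_cg->reclaim_work);
                        return -EBUSY;
                }

                /* Nothing left to reclaim anywhere in this subtree; give up. */
                if (sgx_epc_cgroup_lru_empty(epc_cg->cg))
                        return -ENOMEM;

                /* Synchronous path: reclaim from the subtree and retry. */
                sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
        }
}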