--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -101,6 +101,7 @@ extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
unsigned long totalpages);
+extern unsigned long try_oom_notifier(void);
extern bool out_of_memory(struct oom_control *oc);
extern void exit_oom_victim(void);
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -871,6 +871,36 @@ int unregister_oom_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+/**
+ * try_oom_notifier - Try to reclaim memory from OOM notifier list.
+ *
+ * Returns non-zero if memory was or is being released, zero otherwise.
+ */
+unsigned long try_oom_notifier(void)
+{
+ static DEFINE_MUTEX(oom_notifier_lock);
+ unsigned long freed = 0;
+
+ /*
+ * This lock serializes concurrent calls to OOM notifiers, both to
+ * protect notifiers which are not thread safe and to avoid releasing
+ * memory more often than necessary from notifiers which release
+ * memory on every invocation.
+ */
+ if (!mutex_trylock(&oom_notifier_lock))
+ return 1;
+ /*
+ * But teach lockdep that mutex_trylock() above acts like
+ * mutex_lock(), for we are not allowed to depend on
+ * __GFP_DIRECT_RECLAIM && !__GFP_NORETRY allocations here.
+ */
+ mutex_release(&oom_notifier_lock.dep_map, 1, _THIS_IP_);
+ mutex_acquire(&oom_notifier_lock.dep_map, 0, 0, _THIS_IP_);
+ blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+ mutex_unlock(&oom_notifier_lock);
+ return freed;
+}
+
void exit_oom_mm(struct mm_struct *mm)
{
struct task_struct *p, *tmp;
@@ -937,19 +967,11 @@ static bool oom_has_pending_victims(struct oom_control *oc)
*/
bool out_of_memory(struct oom_control *oc)
{
- unsigned long freed = 0;
enum oom_constraint constraint = CONSTRAINT_NONE;
if (oom_killer_disabled)
return false;
- if (!is_memcg_oom(oc)) {
- blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
- if (freed > 0)
- /* Got some memory back in the last second. */
- return true;
- }
-
/*
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3447,10 +3447,50 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
return page;
}
+static inline bool can_oomkill(gfp_t gfp_mask, unsigned int order,
+ const struct alloc_context *ac)
+{
+ /* Coredumps can quickly deplete all memory reserves */
+ if (current->flags & PF_DUMPCORE)
+ return false;
+ /* The OOM killer will not help higher order allocs */
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
+ return false;
+ /*
+ * We have already exhausted all our reclaim opportunities without any
+ * success so it is time to admit defeat. We will skip the OOM killer
+ * because it is very likely that the caller has a more reasonable
+ * fallback than shooting a random task.
+ */
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ return false;
+ /* The OOM killer does not needlessly kill tasks for lowmem */
+ if (ac->high_zoneidx < ZONE_NORMAL)
+ return false;
+ if (pm_suspended_storage())
+ return false;
+ /*
+ * XXX: GFP_NOFS allocations should rather fail than rely on
+ * other requests to make forward progress.
+ * We are in an unfortunate situation where out_of_memory cannot
+ * do much for this context but let's try it to at least get
+ * access to memory reserved if the current task is killed (see
+ * out_of_memory). Once filesystems are ready to handle allocation
+ * failures more gracefully we should just bail out here.
+ */
+
+ /* The OOM killer may not free memory on a specific node */
+ if (gfp_mask & __GFP_THISNODE)
+ return false;
+
+ return true;
+}
+
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ const bool oomkill = can_oomkill(gfp_mask, order, ac);
struct oom_control oc = {
.zonelist = ac->zonelist,
.nodemask = ac->nodemask,
@@ -3462,6 +3502,10 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
*did_some_progress = 0;
+ /* Try to reclaim memory via OOM notifier callbacks. */
+ if (oomkill)
+ *did_some_progress = try_oom_notifier();
+
/*
* Acquire the oom lock. If that fails, somebody else is
* making progress for us.
@@ -3484,37 +3528,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
if (page)
goto out;
- /* Coredumps can quickly deplete all memory reserves */
- if (current->flags & PF_DUMPCORE)
- goto out;
- /* The OOM killer will not help higher order allocs */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- goto out;
- /*
- * We have already exhausted all our reclaim opportunities without any
- * success so it is time to admit defeat. We will skip the OOM killer
- * because it is very likely that the caller has a more reasonable
- * fallback than shooting a random task.
- */
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
- goto out;
- /* The OOM killer does not needlessly kill tasks for lowmem */
- if (ac->high_zoneidx < ZONE_NORMAL)
- goto out;
- if (pm_suspended_storage())
- goto out;
- /*
- * XXX: GFP_NOFS allocations should rather fail than rely on
- * other request to make a forward progress.
- * We are in an unfortunate situation where out_of_memory cannot
- * do much for this context but let's try it to at least get
- * access to memory reserved if the current task is killed (see
- * out_of_memory). Once filesystems are ready to handle allocation
- * failures more gracefully we should just bail out here.
- */
-
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
+ if (!oomkill)
goto out;
/* Exhausted what can be done so it's blame time */
Since blocking_notifier_call_chain() in out_of_memory() might sleep, sleeping
with oom_lock held is currently an unavoidable problem. As a preparation for
not sleeping with oom_lock held, this patch brings the OOM notifier callbacks
outside of oom_lock.

We are planning to eventually replace OOM notifier callbacks with different
mechanisms (e.g. the shrinker API), but such changes are out of scope for
this series.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
---
 include/linux/oom.h |  1 +
 mm/oom_kill.c       | 38 +++++++++++++++++++++------
 mm/page_alloc.c     | 76 +++++++++++++++++++++++++++++++----------------------
 3 files changed, 76 insertions(+), 39 deletions(-)
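
For reference, this is roughly what a user of the OOM notifier chain looks
like; the sketch below is hypothetical (example_oom_notify,
example_cache_shrink and example_oom_nb are made-up names), but
register_oom_notifier() and the notifier_block interface are the real API
whose invocation this patch moves out of oom_lock:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/oom.h>

/* Hypothetical driver-private cache; shrinking it releases pages. */
static unsigned long example_cache_shrink(void)
{
	return 0; /* placeholder for real reclaim logic */
}

/*
 * OOM notifier callback: with this patch it is invoked from
 * try_oom_notifier() rather than from out_of_memory(), i.e. without
 * oom_lock held and serialized by oom_notifier_lock.
 */
static int example_oom_notify(struct notifier_block *nb,
			      unsigned long unused, void *parm)
{
	unsigned long *freed = parm;

	/* Report how many pages we released back to the notifier chain. */
	*freed += example_cache_shrink();
	return NOTIFY_OK;
}

static struct notifier_block example_oom_nb = {
	.notifier_call = example_oom_notify,
};

static int __init example_init(void)
{
	return register_oom_notifier(&example_oom_nb);
}
module_init(example_init);

Note the serialization semantics this patch introduces: if a second task
enters try_oom_notifier() while the callbacks are already running, it gets
back 1 (treated as progress) instead of invoking the callbacks again, which
both protects non-thread-safe notifiers and avoids draining such caches more
often than necessary.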