@@ -984,6 +984,11 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ if (unlikely(mm_is_oom_victim(mm))) {
+ mutex_lock(&oom_lock);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mutex_unlock(&oom_lock);
+ }
mmdrop(mm);
}
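
(A loose userspace sketch of the locking pattern above, not kernel code; all names below are invented. The point is that the "victim is finished" signal is published under the same lock the victim-selection path holds, so the two cannot interleave.)

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;
static bool victim_done;			/* stands in for MMF_OOM_SKIP */

static void victim_exit_path(void)		/* roughly __mmput() */
{
	/* ... the victim's memory has just been freed ... */
	pthread_mutex_lock(&oom_lock);
	victim_done = true;			/* set_bit(MMF_OOM_SKIP, ...) */
	pthread_mutex_unlock(&oom_lock);
}

static void oom_killer_path(void)		/* roughly out_of_memory() */
{
	pthread_mutex_lock(&oom_lock);
	if (victim_done)
		printf("victim finished, may select a new one\n");
	else
		printf("still waiting for the current victim\n");
	pthread_mutex_unlock(&oom_lock);
}

int main(void)
{
	oom_killer_path();
	victim_exit_path();
	oom_killer_path();
	return 0;
}
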
@@ -3075,19 +3075,17 @@ void exit_mmap(struct mm_struct *mm)
__oom_reap_task_mm(mm);
/*
- * Now, set MMF_UNSTABLE to avoid racing with the oom reaper.
- * This needs to be done before calling munlock_vma_pages_all(),
- * which clears VM_LOCKED, otherwise the oom reaper cannot
- * reliably test for it. If the oom reaper races with
- * munlock_vma_pages_all(), this can result in a kernel oops if
- * a pmd is zapped, for example, after follow_page_mask() has
- * checked pmd_none().
+ * Wait for the oom reaper to complete. This needs to be done
+ * before calling munlock_vma_pages_all(), which clears
+ * VM_LOCKED, otherwise the oom reaper cannot reliably test for
+ * it. If the oom reaper races with munlock_vma_pages_all(),
+ * this can result in a kernel oops if a pmd is zapped, for
+ * example, after follow_page_mask() has checked pmd_none().
*
- * Taking mm->mmap_sem for write after setting MMF_UNSTABLE will
- * guarantee that the oom reaper will not run on this mm again
- * after mmap_sem is dropped.
+ * Taking mm->mmap_sem for write will guarantee that the oom
+ * reaper will not run on this mm again after mmap_sem is
+ * dropped.
*/
- set_bit(MMF_UNSTABLE, &mm->flags);
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
}
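
(A loose userspace sketch of the wait above, not kernel code; the pthread names merely stand in for the kernel primitives. Setting the flag and then cycling the lock for write guarantees that any reaper still inside the read-side critical section has finished, and any later one sees the flag and bails out.)

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static atomic_bool unstable;			/* stands in for MMF_UNSTABLE */

static void *reaper(void *arg)			/* roughly oom_reap_task_mm() */
{
	(void)arg;
	if (pthread_rwlock_tryrdlock(&mmap_sem) != 0)
		return NULL;			/* down_read_trylock() failed */
	if (!atomic_load(&unstable)) {
		/* ... walk and unmap vmas ... */
	}
	pthread_rwlock_unlock(&mmap_sem);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, reaper, NULL);

	atomic_store(&unstable, true);		/* set before the barrier */

	/*
	 * The barrier: a reaper that missed the flag is still holding the
	 * lock for read, and the write lock waits for it to be dropped.
	 */
	pthread_rwlock_wrlock(&mmap_sem);
	pthread_rwlock_unlock(&mmap_sem);

	/* From here on, no reaper will touch this mm again. */
	printf("safe to munlock and free page tables\n");
	pthread_join(t, NULL);
	return 0;
}
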
@@ -3115,7 +3113,6 @@ void exit_mmap(struct mm_struct *mm)
unmap_vmas(&tlb, vma, 0, -1);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
- set_bit(MMF_OOM_SKIP, &mm->flags);
/*
* Walk the list again, actually closing and freeing it,
@@ -488,11 +488,9 @@ void __oom_reap_task_mm(struct mm_struct *mm)
* Tell all users of get_user/copy_from_user etc... that the content
* is no longer stable. No barriers really needed because unmapping
* should imply barriers already and the reader would hit a page fault
- * if it stumbled over a reaped memory. If MMF_UNSTABLE is already set,
- * reaping has already occurred so nothing left to do.
+ * if it stumbled over a reaped memory.
*/
- if (test_and_set_bit(MMF_UNSTABLE, &mm->flags))
- return;
+ set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
if (!can_madv_dontneed_vma(vma))
@@ -524,25 +522,9 @@ void __oom_reap_task_mm(struct mm_struct *mm)
static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
- /*
- * We have to make sure to not race with the victim exit path
- * and cause premature new oom victim selection:
- * oom_reap_task_mm exit_mm
- * mmget_not_zero
- * mmput
- * atomic_dec_and_test
- * exit_oom_victim
- * [...]
- * out_of_memory
- * select_bad_process
- * # no TIF_MEMDIE task selects new victim
- * unmap_page_range # frees some memory
- */
- mutex_lock(&oom_lock);
-
if (!down_read_trylock(&mm->mmap_sem)) {
trace_skip_task_reaping(tsk->pid);
- goto out_oom;
+ return;
}
/*
@@ -555,10 +537,18 @@ static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
goto out_mm;
/*
- * MMF_UNSTABLE is set by exit_mmap when the OOM reaper can't
- * work on the mm anymore. The check for MMF_UNSTABLE must run
- * under mmap_sem for reading because it serializes against the
- * down_write();up_write() cycle in exit_mmap().
+ * MMF_UNSTABLE is set before exit_mmap() calls munlock_vma_pages_all(),
+ * so a reaper that gets here afterwards bails out instead of racing
+ * with the munlock. The check for MMF_UNSTABLE must run under mmap_sem
+ * for reading because it serializes against the down_write();up_write()
+ * cycle in exit_mmap().
+ *
+ * However, __oom_reap_task_mm() - called from exit_mmap() as well as
+ * from here - sets MMF_UNSTABLE before it starts reaping (it tells all
+ * users of get_user/copy_from_user etc... that the content is no longer
+ * stable), so the flag no longer indicates that the OOM reaper can't
+ * work on the mm anymore. The OOM reaper instead gives up after (by
+ * default) 1 second even if exit_mmap() is doing __oom_reap_task_mm().
*/
if (test_bit(MMF_UNSTABLE, &mm->flags)) {
trace_skip_task_reaping(tsk->pid);
@@ -576,8 +566,6 @@ static void oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_mm:
up_read(&mm->mmap_sem);
-out_oom:
- mutex_unlock(&oom_lock);
}
static void oom_reap_task(struct task_struct *tsk)
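
(As the comment above says, the reaper no longer has a reliable "done" flag: each pass just tries mmap_sem and skips if it cannot get it, relying on the timeout in oom_reap_task() to eventually set MMF_OOM_SKIP. A rough userspace sketch of one such pass, with invented names, not kernel code:)

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static bool reap_one_pass(void)			/* roughly oom_reap_task_mm() */
{
	if (pthread_rwlock_tryrdlock(&mmap_sem) != 0)
		return false;			/* skip; caller will retry later */

	/* ... __oom_reap_task_mm() would run here ... */

	pthread_rwlock_unlock(&mmap_sem);
	return true;
}

int main(void)
{
	/* Simulate the exit path holding mmap_sem for write. */
	pthread_rwlock_wrlock(&mmap_sem);
	printf("pass while exit path holds the lock: %d\n", reap_one_pass());
	pthread_rwlock_unlock(&mmap_sem);

	printf("pass afterwards: %d\n", reap_one_pass());
	return 0;
}
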
@@ -591,12 +579,7 @@ static void oom_reap_task(struct task_struct *tsk)
if (test_bit(MMF_OOM_SKIP, &mm->flags))
goto drop;
- /*
- * If this mm has already been reaped, doing so again will not likely
- * free additional memory.
- */
- if (!test_bit(MMF_UNSTABLE, &mm->flags))
- oom_reap_task_mm(tsk, mm);
+ oom_reap_task_mm(tsk, mm);
if (time_after_eq(jiffies, mm->oom_free_expire)) {
if (!test_bit(MMF_OOM_SKIP, &mm->flags)) {
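
(Side note on the deadline test above: time_after_eq() is used rather than a plain ">=" because jiffies wraps around, and the signed difference keeps the comparison correct across the wrap. Simplified sketch below; the real macro in include/linux/jiffies.h also typechecks its arguments.)

#include <stdio.h>

#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)

int main(void)
{
	unsigned long expire = -3UL;	/* deadline set just before jiffies wraps */
	unsigned long now = 2UL;	/* jiffies has since wrapped past zero */

	printf("plain now >= expire       : %d\n", now >= expire);		   /* 0, wrong */
	printf("time_after_eq(now, expire): %d\n", time_after_eq(now, expire));   /* 1, right */
	return 0;
}
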
@@ -658,12 +641,16 @@ static int oom_reaper(void *unused)
static u64 oom_free_timeout_ms = 1000;
static void wake_oom_reaper(struct task_struct *tsk)
{
+ unsigned long expire = jiffies + msecs_to_jiffies(oom_free_timeout_ms);
+
+	/* 0 means "not queued yet"; an expire of 0 would allow a double list_add(). */
+ if (!expire)
+ expire++;
/*
* Set the reap timeout; if it's already set, the mm is enqueued and
* this tsk can be ignored.
*/
- if (cmpxchg(&tsk->signal->oom_mm->oom_free_expire, 0UL,
- jiffies + msecs_to_jiffies(oom_free_timeout_ms)))
+ if (cmpxchg(&tsk->signal->oom_mm->oom_free_expire, 0UL, expire))
return;
get_task_struct(tsk);
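
(The cmpxchg() above doubles as a "queued exactly once" marker: 0 means the mm has no deadline yet, which is why the computed deadline is bumped if it happens to be 0. A rough userspace equivalent with C11 atomics, invented names, not kernel code:)

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong oom_free_expire;	/* 0 == mm not queued for reaping */

static bool queue_for_reaping(unsigned long now, unsigned long timeout)
{
	unsigned long expire = now + timeout;
	unsigned long expected = 0;

	if (!expire)			/* never store the "not queued" sentinel */
		expire++;

	/* Only the first caller moves 0 -> expire and enqueues the mm. */
	return atomic_compare_exchange_strong(&oom_free_expire, &expected, expire);
}

int main(void)
{
	printf("first  wake: %d\n", queue_for_reaping(100, 1000));	/* 1: enqueued */
	printf("second wake: %d\n", queue_for_reaping(200, 1000));	/* 0: already queued */
	return 0;
}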