
[v4] mm: vmpressure: don't count proactive reclaim in vmpressure

Message ID 20220714064918.2576464-1-yosryahmed@google.com (mailing list archive)
State New
Series [v4] mm: vmpressure: don't count proactive reclaim in vmpressure

Commit Message

Yosry Ahmed July 14, 2022, 6:49 a.m. UTC
vmpressure is used in cgroup v1 to notify userspace of reclaim
efficiency events, and is also used in both cgroup v1 and v2 as a signal
of memory pressure for networking; see
mem_cgroup_under_socket_pressure().

Proactive reclaim is intended to probe memcgs for cold memory without
affecting their performance. Hence, reclaim caused by writing to
memory.reclaim should not trigger vmpressure.

Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
---
Changes in v4:
- Removed unneeded reclaim_options local variables (Andrew).

Changes in v3:
- Limited the vmpressure change to memory.reclaim, dropped psi changes,
  updated changelog to reflect new behavior (Michal, Shakeel)

Changes in v2:
- Removed unnecessary initializations to zero (Andrew).
- Separate declarations and initializations when it causes line wrapping
  (Andrew).

---
 include/linux/swap.h |  5 ++++-
 mm/memcontrol.c      | 24 ++++++++++++++----------
 mm/vmscan.c          | 27 +++++++++++++++++----------
 3 files changed, 35 insertions(+), 21 deletions(-)
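
For context, the interface this patch tunes is driven entirely from
userspace. A minimal C sketch of invoking proactive reclaim (the cgroup
path and amount below are made-up examples, not part of this patch):

/* Build: cc -o proactive_reclaim proactive_reclaim.c
 * Assumes a cgroup v2 hierarchy mounted at /sys/fs/cgroup and an
 * existing cgroup named "example-job"; both are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/example-job/memory.reclaim";
	const char *amount = "100M";	/* memparse-style value: bytes, or K/M/G suffix */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* With this patch, the reclaim triggered by this write no longer
	 * feeds vmpressure, so it no longer throttles networking. */
	if (write(fd, amount, strlen(amount)) < 0)
		perror("write");	/* e.g. EAGAIN if the target could not be met */
	close(fd);
	return 0;
}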

Comments

Yosry Ahmed July 19, 2022, 5:32 p.m. UTC | #1
On Wed, Jul 13, 2022 at 11:49 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [...]

Hey Michal,

Any outstanding comments/thoughts on this version?
Michal Hocko July 20, 2022, 9:24 a.m. UTC | #2
On Thu 14-07-22 06:49:18, Yosry Ahmed wrote:
> vmpressure is used in cgroup v1 to notify userspace of reclaim
> efficiency events, and is also used in both cgroup v1 and v2 as a signal
> of memory pressure for networking; see
> mem_cgroup_under_socket_pressure().

> Proactive reclaim is intended to probe memcgs for cold memory without
> affecting their performance. Hence, reclaim caused by writing to
> memory.reclaim should not trigger vmpressure.

I am not against the change, but this is a rather vague statement. Please
be more specific. You are not really explaining what side effects this
has and why they are desirable. Sure, proactive reclaim can be used to
probe for cold memory, but it can also be used to balance memory
consumption or for other potential usecases.

I think what we are missing here is
- explain that this doesn't have any effect on existing users of the
  vmpressure user interface, because that is cgroup v1 and memory.reclaim
  is a v2 feature. This is a trivial statement but quite useful for future
  readers of this commit
- explain the effect on the networking layer and the typical usecases
  memory.reclaim is currently used for, and ideally document that.
- how are we going to deal with users who would really want to use
  the memory.reclaim interface as a replacement for existing hard/high
  memory reclaim? Is that even something that the interface is intended
  for?

> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> Acked-by: Shakeel Butt <shakeelb@google.com>

The patch itself looks good to me.

> [...]
Shakeel Butt July 20, 2022, 5:49 p.m. UTC | #3
On Wed, Jul 20, 2022 at 2:24 AM Michal Hocko <mhocko@suse.com> wrote:
>
[...]
>
> I think what we are missing here is
> - explain that this doesn't have any effect on existing users of the
>   vmpressure user interface, because that is cgroup v1 and memory.reclaim
>   is a v2 feature. This is a trivial statement but quite useful for future
>   readers of this commit
> - explain the effect on the networking layer and the typical usecases
>   memory.reclaim is currently used for, and ideally document that.

I agree with the above two points (Yosry, please address those) but
the following third point is orthogonal and we don't really need to
have an answer for this patch to be accepted.

> - how are we going to deal with users who would really want to use
>   the memory.reclaim interface as a replacement for existing hard/high
>   memory reclaim? Is that even something that the interface is intended
>   for?

I do agree that this question is important. Nowadays I am looking at
this from a different perspective and use-case: more concretely, how
(and why) to replace vmpressure-based network throttling for cgroup
v2. I will start a separate thread for that discussion.
Yosry Ahmed July 20, 2022, 6:02 p.m. UTC | #4
On Wed, Jul 20, 2022 at 10:50 AM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Wed, Jul 20, 2022 at 2:24 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> [...]
> >
> > I think what we are missing here is
> > - explain that this doesn't have any effect on existing users of the
> >   vmpressure user interface, because that is cgroup v1 and memory.reclaim
> >   is a v2 feature. This is a trivial statement but quite useful for future
> >   readers of this commit
> > - explain the effect on the networking layer and the typical usecases
> >   memory.reclaim is currently used for, and ideally document that.
>
> I agree with the above two points (Yosry, please address those) but
> the following third point is orthogonal and we don't really need to
> have an answer for this patch to be accepted.
>

That's great feedback, thanks Michal and Shakeel!

How do you feel about the following commit message instead? Does it
address your concerns?:

memory.reclaim is a cgroup v2 interface that allows users to
proactively reclaim memory from a memcg, without real memory pressure.
Reclaim operations invoke vmpressure, which is used in cgroup v1 to
notify userspace of reclaim efficiency, and used in both v1 and v2 as
a signal for a memcg being under memory pressure for networking (see
mem_cgroup_under_socket_pressure()). For the former, vmpressure
notifications in v1 are not affected by this change since
memory.reclaim is a v2 feature.

For the latter, the effects of the vmpressure signal (according to
Shakeel [1]) are as follows:
1. Reducing send and receive buffers of the current socket.
2. May drop packets on the rx path.
3. May throttle current thread on the tx path.

Since proactive reclaim is invoked directly by userspace, not by
memory pressure, it makes sense not to throttle networking. Hence,
this change makes sure that proactive reclaim caused by memory.reclaim
does not trigger vmpressure.

[1] https://lore.kernel.org/lkml/CALvZod68WdrXEmBpOkadhB5GPYmCXaDZzXH=yyGOCAjFRn4NDQ@mail.gmail.com/
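
To make the networking side concrete, here is a self-contained C sketch
of the *semantics* of mem_cgroup_under_socket_pressure() as described
above (a simplified userspace model, not the actual kernel code):
vmpressure stamps a pressure window on the memcg, and socket code checks
the memcg and its ancestors against that window.

#include <stdbool.h>
#include <stddef.h>
#include <time.h>

struct memcg {
	struct memcg *parent;
	time_t socket_pressure_until;	/* stamped by vmpressure on critical events */
};

/* Any ancestor still inside its pressure window makes the socket code
 * back off: shrink buffers, maybe drop rx packets, throttle tx. */
static bool under_socket_pressure(const struct memcg *memcg)
{
	time_t now = time(NULL);

	for (; memcg; memcg = memcg->parent)
		if (now < memcg->socket_pressure_until)
			return true;
	return false;
}

int main(void)
{
	struct memcg root = { .parent = NULL, .socket_pressure_until = 0 };
	struct memcg child = { .parent = &root,
			       .socket_pressure_until = time(NULL) + 1 };

	/* child was recently marked by vmpressure; root was not */
	return under_socket_pressure(&child) ? 0 : 1;
}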

> > - how are we going to deal with users who would really want to use
> >   the memory.reclaim interface as a replacement for existing hard/high
> >   memory reclaim? Is that even something that the interface is intended
> >   for?
>
> I do agree that this question is important. Nowadays I am looking at
> this from a different perspective and use-case. More concretely how
> (and why) to replace vmpressure based network throttling for cgroup
> v2. I will start a separate thread for that discussion.
Michal Hocko July 21, 2022, 11:42 a.m. UTC | #5
On Wed 20-07-22 10:49:53, Shakeel Butt wrote:
> On Wed, Jul 20, 2022 at 2:24 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> [...]
> >
> > I think what we are missing here is
> > - explain that this doesn't have any effect on existing users of the
> >   vmpressure user interface, because that is cgroup v1 and memory.reclaim
> >   is a v2 feature. This is a trivial statement but quite useful for future
> >   readers of this commit
> > - explain the effect on the networking layer and the typical usecases
> >   memory.reclaim is currently used for, and ideally document that.
> 
> I agree with the above two points (Yosry, please address those) but
> the following third point is orthogonal and we don't really need to
> have an answer for this patch to be accepted.
> 
> > - how are we going to deal with users who would really want to use
> >   the memory.reclaim interface as a replacement for existing hard/high
> >   memory reclaim? Is that even something that the interface is intended
> >   for?
> 
> I do agree that this question is important. Nowadays I am looking at
> this from a different perspective and use-case: more concretely, how
> (and why) to replace vmpressure-based network throttling for cgroup
> v2. I will start a separate thread for that discussion.

I think we should be good to document this side effect for now. If you
have a plan to change the vmpressure based throttling, all the better.
But one way or the other, the impact of the memory.reclaim interface on
networking should be documented properly.
Michal Hocko July 21, 2022, 11:44 a.m. UTC | #6
On Wed 20-07-22 11:02:56, Yosry Ahmed wrote:
> On Wed, Jul 20, 2022 at 10:50 AM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > [...]
> 
> That's great feedback, thanks Michal and Shakeel!
> 
> How do you feel about the following commit message instead? Does it
> address your concerns?:
> 
> memory.reclaim is a cgroup v2 interface that allows users to
> proactively reclaim memory from a memcg, without real memory pressure.
> Reclaim operations invoke vmpressure, which is used in cgroup v1 to
> notify userspace of reclaim efficiency, and used in both v1 and v2 as
> a signal for a memcg being under memory pressure for networking (see
> mem_cgroup_under_socket_pressure()). For the former, vmpressure
> notifications in v1 are not affected by this change since
> memory.reclaim is a v2 feature.
> 
> For the latter, the effects of the vmpressure signal (according to
> Shakeel [1]) are as follows:
> 1. Reducing send and receive buffers of the current socket.
> 2. May drop packets on the rx path.
> 3. May throttle current thread on the tx path.
> 
> Since proactive reclaim is invoked directly by userspace, not by
> memory pressure, it makes sense not to throttle networking. Hence,
> this change makes sure that proactive reclaim caused by memory.reclaim
> does not trigger vmpressure.

OK, looks much better. Please also add a note to the documentation about
this side effect.

Thanks!
Yosry Ahmed July 21, 2022, 3:58 p.m. UTC | #7
On Thu, Jul 21, 2022 at 4:44 AM Michal Hocko <mhocko@suse.com> wrote:
>
> [...]
>
> OK, looks much better. Please also add a note to the documentation about
> this side effect.

I don't want to add something to the documentation about throttling
networking because it seems like these are implementation details that
we may change in the future. I don't know if we can document this
behavior today and then change it later.

How about we document a more generic statement in memory.reclaim
documentation, like:

"With reactive reclaim operations triggered by the kernel, the kernel
may take further actions to alleviate memory pressure (such as
throttling networking memory consumption). For proactive reclaim
operations triggered by this interface, the kernel may choose to skip
such actions as reclaim is not an indication of memory pressure."

Does this make sense to you?

>
> Thanks!
> --
> Michal Hocko
> SUSE Labs
Michal Hocko July 21, 2022, 4:42 p.m. UTC | #8
On Thu 21-07-22 08:58:06, Yosry Ahmed wrote:
> On Thu, Jul 21, 2022 at 4:44 AM Michal Hocko <mhocko@suse.com> wrote:
> > [...]
> >
> > OK, looks much better. Please also add a note to the documentation about
> > this side effect.
> 
> I don't want to add something to the documentation about throttling
> networking because it seems like these are implementation details that
> we may change in the future. I don't know if we can document this
> behavior today and then change it later.

The exact mechanism of how the throttling is done is one thing. This can
change. But the fact that _no_ throttling is applied is something that
we shouldn't change, of course. If we were really strict we shouldn't
change it even now, but considering that the interface is new and
usecases are still shaping, better now than later.

> How about we document a more generic statement in memory.reclaim
> documentation, like:
> 
> "With reactive reclaim operations triggered by the kernel, the kernel
> may take further actions to alleviate memory pressure (such as
> throttling networking memory consumption). For proactive reclaim
> operations triggered by this interface, the kernel may choose to skip
> such actions as reclaim is not an indication of memory pressure."

IDK, this sounds too much like word lawyering to me TBH. It is better to be
clear about explicitly known side effects. For example, where do shrinkers
stand in light of the above wording? The kernel can choose to do almost
anything, and I do not think we want to control which shrinkers are
triggered and what they do.

So I would really prefer to say something like:
"
Please note that the proactive reclaim (triggered by this interface) is
not meant to indicate memory pressure on the memory cgroup. Therefore
socket memory balancing triggered by the memory reclaim normally is not
exercised in this case. This means that the networking layer will not
adapt based on reclaim induced by memory.reclaim.
"
Yosry Ahmed July 21, 2022, 4:49 p.m. UTC | #9
On Thu, Jul 21, 2022 at 9:42 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 21-07-22 08:58:06, Yosry Ahmed wrote:
> > [...]
> > >
> > > OK, looks much better. Please also add a note to the documentation about
> > > this side effect.
> >
> > I don't want to add something to the documentation about throttling
> > networking because it seems like these are implementation details that
> > we may change in the future. I don't know if we can document this
> > behavior today and then change it later.
>
> The exact mechanism of how the throttling is done is one thing. This can
> change. But the fact that _no_ throttling is applied is something that
> we shouldn't change, of course. If we were really strict we shouldn't
> change it even now, but considering that the interface is new and
> usecases are still shaping, better now than later.
>
> > How about we document a more generic statement in memory.reclaim
> > documentation, like:
> >
> > "With reactive reclaim operations triggered by the kernel, the kernel
> > may take further actions to alleviate memory pressure (such as
> > throttling networking memory consumption). For proactive reclaim
> > operations triggered by this interface, the kernel may choose to skip
> > such actions as reclaim is not an indication of memory pressure."
>
> IDK, this sounds too much like word lawyering to me TBH. It is better to be
> clear about explicitly known side effects. For example, where do shrinkers
> stand in light of the above wording? The kernel can choose to do almost
> anything, and I do not think we want to control which shrinkers are
> triggered and what they do.
>
> So I would really prefer to say something like:
> "
> Please note that the proactive reclaim (triggered by this interface) is
> not meant to indicate memory pressure on the memory cgroup. Therefore
> socket memory balancing triggered by the memory reclaim normally is not
> exercised in this case. This means that the networking layer will not
> adapt based on reclaim induced by memory.reclaim.
> "

Sounds good to me! Will send v5 shortly with added doc changes and the
newly agreed upon commit log. Thanks Michal!

> --
> Michal Hocko
> SUSE Labs

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0c0fed1b348f..f6e9eaa2339f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -411,10 +411,13 @@  extern void lru_cache_add_inactive_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
+
+#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
+#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 						  unsigned long nr_pages,
 						  gfp_t gfp_mask,
-						  bool may_swap);
+						  unsigned int reclaim_options);
 extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						pg_data_t *pgdat,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a550042d88c3..b668224142c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2331,7 +2331,8 @@  static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
 		psi_memstall_enter(&pflags);
 		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
-							     gfp_mask, true);
+							gfp_mask,
+							MEMCG_RECLAIM_MAY_SWAP);
 		psi_memstall_leave(&pflags);
 	} while ((memcg = parent_mem_cgroup(memcg)) &&
 		 !mem_cgroup_is_root(memcg));
@@ -2576,7 +2577,7 @@  static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	struct page_counter *counter;
 	unsigned long nr_reclaimed;
 	bool passed_oom = false;
-	bool may_swap = true;
+	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
 	bool drained = false;
 	unsigned long pflags;
 
@@ -2593,7 +2594,7 @@  static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
 	} else {
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-		may_swap = false;
+		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
 	}
 
 	if (batch > nr_pages) {
@@ -2620,7 +2621,7 @@  static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	psi_memstall_enter(&pflags);
 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-						    gfp_mask, may_swap);
+						    gfp_mask, reclaim_options);
 	psi_memstall_leave(&pflags);
 
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -3402,8 +3403,8 @@  static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 			continue;
 		}
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1,
-					GFP_KERNEL, !memsw)) {
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+					memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
 			ret = -EBUSY;
 			break;
 		}
@@ -3513,7 +3514,8 @@  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
+		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+						  MEMCG_RECLAIM_MAY_SWAP))
 			nr_retries--;
 	}
 
@@ -6248,7 +6250,7 @@  static ssize_t memory_high_write(struct kernfs_open_file *of,
 		}
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-							 GFP_KERNEL, true);
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 
 		if (!reclaimed && !nr_retries--)
 			break;
@@ -6297,7 +6299,7 @@  static ssize_t memory_max_write(struct kernfs_open_file *of,
 
 		if (nr_reclaims) {
 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-							  GFP_KERNEL, true))
+					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
 				nr_reclaims--;
 			continue;
 		}
@@ -6426,6 +6428,7 @@  static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	unsigned int reclaim_options;
 	int err;
 
 	buf = strstrip(buf);
@@ -6433,6 +6436,7 @@  static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	if (err)
 		return err;
 
+	reclaim_options	= MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
 	while (nr_reclaimed < nr_to_reclaim) {
 		unsigned long reclaimed;
 
@@ -6449,7 +6453,7 @@  static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
 		reclaimed = try_to_free_mem_cgroup_pages(memcg,
 						nr_to_reclaim - nr_reclaimed,
-						GFP_KERNEL, true);
+						GFP_KERNEL, reclaim_options);
 
 		if (!reclaimed && !nr_retries--)
 			return -EAGAIN;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7d9a683e3a7..0969e6408a53 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -102,6 +102,9 @@  struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	unsigned int may_swap:1;
 
+	/* Proactive reclaim invoked by userspace through memory.reclaim */
+	unsigned int proactive:1;
+
 	/*
 	 * Cgroup memory below memory.low is protected as long as we
 	 * don't threaten to OOM. If any cgroup is reclaimed at
@@ -3125,9 +3128,10 @@  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			    sc->priority);
 
 		/* Record the group's reclaim efficiency */
-		vmpressure(sc->gfp_mask, memcg, false,
-			   sc->nr_scanned - scanned,
-			   sc->nr_reclaimed - reclaimed);
+		if (!sc->proactive)
+			vmpressure(sc->gfp_mask, memcg, false,
+				   sc->nr_scanned - scanned,
+				   sc->nr_reclaimed - reclaimed);
 
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
@@ -3250,9 +3254,10 @@  static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	}
 
 	/* Record the subtree's reclaim efficiency */
-	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
-		   sc->nr_scanned - nr_scanned,
-		   sc->nr_reclaimed - nr_reclaimed);
+	if (!sc->proactive)
+		vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+			   sc->nr_scanned - nr_scanned,
+			   sc->nr_reclaimed - nr_reclaimed);
 
 	if (sc->nr_reclaimed - nr_reclaimed)
 		reclaimable = true;
@@ -3534,8 +3539,9 @@  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
 	do {
-		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
-				sc->priority);
+		if (!sc->proactive)
+			vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+					sc->priority);
 		sc->nr_scanned = 0;
 		shrink_zones(zonelist, sc);
 
@@ -3825,7 +3831,7 @@  unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
-					   bool may_swap)
+					   unsigned int reclaim_options)
 {
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
@@ -3838,7 +3844,8 @@  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.priority = DEF_PRIORITY,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
-		.may_swap = may_swap,
+		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
+		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
 	};
 	/*
 	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
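
A side note on the !!(reclaim_options & ...) idiom in the final hunk:
may_swap and proactive are 1-bit bitfields, so assigning a raw mask
result such as (1 << 2) would be silently truncated to 0. The !! forces
the value to 0 or 1 first. A minimal standalone C demo of just that
idiom (not kernel code; the struct below is a stand-in for scan_control):

#include <stdio.h>

#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)

struct scan_control_demo {
	unsigned int may_swap:1;
	unsigned int proactive:1;
};

int main(void)
{
	unsigned int opts = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
	struct scan_control_demo sc = {
		.may_swap = !!(opts & MEMCG_RECLAIM_MAY_SWAP),
		.proactive = !!(opts & MEMCG_RECLAIM_PROACTIVE),
	};

	/* Without !!, (opts & MEMCG_RECLAIM_PROACTIVE) evaluates to 4,
	 * and a 1-bit field keeps only the low bit, i.e. 0. */
	printf("may_swap=%u proactive=%u\n", sc.may_swap, sc.proactive);
	return 0;
}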