diff mbox series

[v2,5/5] memcg, inode: protect page cache from freeing inode

Message ID 1577174006-13025-6-git-send-email-laoar.shao@gmail.com (mailing list archive)
State New, archived
Headers show
Series protect page cache from freeing inode | expand

Commit Message

Yafang Shao Dec. 24, 2019, 7:53 a.m. UTC
On my server there're some running MEMCGs protected by memory.{min, low},
but I found the usage of these MEMCGs abruptly became very small, which
were far less than the protect limit. It confused me and finally I
found that was because of inode stealing.
Once an inode is freed, all its belonging page caches will be dropped as
well, no matter how may page caches it has. So if we intend to protect the
page caches in a memcg, we must protect their host (the inode) first.
Otherwise the memcg protection can be easily bypassed with freeing inode,
especially if there're big files in this memcg.

Supposes we have a memcg, and the stat of this memcg is,
	memory.current = 1024M
	memory.min = 512M
And in this memcg there's a inode with 800M page caches.
Once this memcg is scanned by kswapd or other regular reclaimers,
    kswapd <<<< It can be either of the regular reclaimers.
        shrink_node_memcgs
	    switch (mem_cgroup_protected()) <<<< Not protected
		case MEMCG_PROT_NONE:  <<<< Will scan this memcg
			beak;
            shrink_lruvec() <<<< Reclaim the page caches
            shrink_slab()   <<<< It may free this inode and drop all its
                                 page caches(800M).
So we must protect the inode first if we want to protect page caches.

The inherent mismatch between memcg and inode is a trouble. One inode can
be shared by different MEMCGs, but it is a very rare case. If an inode is
shared, its belonging page caches may be charged to different MEMCGs.
Currently there's no perfect solution to fix this kind of issue, but the
inode majority-writer ownership switching can help it more or less.

Cc: Roman Gushchin <guro@fb.com>
Cc: Chris Down <chris@chrisdown.name>
Cc: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 fs/inode.c                 | 25 +++++++++++++++++++++++--
 include/linux/memcontrol.h | 11 ++++++++++-
 mm/memcontrol.c            | 43 +++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c                |  5 +++++
 4 files changed, 81 insertions(+), 3 deletions(-)

Comments

kernel test robot Dec. 25, 2019, 1:01 p.m. UTC | #1
Hi Yafang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.5-rc3 next-20191219]
[cannot apply to mmotm/master]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Yafang-Shao/protect-page-cache-from-freeing-inode/20191225-193636
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 46cf053efec6a3a5f343fead837777efe8252a46
config: um-x86_64_defconfig (attached as .config)
compiler: gcc-7 (Debian 7.5.0-3) 7.5.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=um SUBARCH=x86_64

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from include/linux/swap.h:9:0,
                    from mm/vmscan.c:22:
   include/linux/memcontrol.h:872:23: error: conflicting types for 'memcg'
            struct inode *memcg)
                          ^~~~~
   include/linux/memcontrol.h:871:63: note: previous definition of 'memcg' was here
    static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
                                                                  ^~~~~
   mm/vmscan.c: In function 'shrink_node_memcgs':
>> mm/vmscan.c:2670:9: error: dereferencing pointer to incomplete type 'struct mem_cgroup'
       memcg->in_low_reclaim = 1;
            ^~
--
   In file included from include/linux/swap.h:9:0,
                    from fs/inode.c:11:
   include/linux/memcontrol.h:872:23: error: conflicting types for 'memcg'
            struct inode *memcg)
                          ^~~~~
   include/linux/memcontrol.h:871:63: note: previous definition of 'memcg' was here
    static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
                                                                  ^~~~~
   fs/inode.c: In function 'prune_icache_sb':
>> fs/inode.c:822:7: error: 'struct inode_head' has no member named 'memcg'
     ihead.memcg = sc->memcg;
          ^

vim +2670 mm/vmscan.c

  2639	
  2640	static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
  2641	{
  2642		struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
  2643		struct mem_cgroup *memcg;
  2644	
  2645		memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
  2646		do {
  2647			struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
  2648			unsigned long reclaimed;
  2649			unsigned long scanned;
  2650	
  2651			switch (mem_cgroup_protected(target_memcg, memcg)) {
  2652			case MEMCG_PROT_MIN:
  2653				/*
  2654				 * Hard protection.
  2655				 * If there is no reclaimable memory, OOM.
  2656				 */
  2657				continue;
  2658			case MEMCG_PROT_LOW:
  2659				/*
  2660				 * Soft protection.
  2661				 * Respect the protection only as long as
  2662				 * there is an unprotected supply
  2663				 * of reclaimable memory from other cgroups.
  2664				 */
  2665				if (!sc->memcg_low_reclaim) {
  2666					sc->memcg_low_skipped = 1;
  2667					continue;
  2668				}
  2669	
> 2670				memcg->in_low_reclaim = 1;
  2671				memcg_memory_event(memcg, MEMCG_LOW);
  2672				break;
  2673			case MEMCG_PROT_NONE:
  2674				/*
  2675				 * All protection thresholds breached. We may
  2676				 * still choose to vary the scan pressure
  2677				 * applied based on by how much the cgroup in
  2678				 * question has exceeded its protection
  2679				 * thresholds (see get_scan_count).
  2680				 */
  2681				break;
  2682			case MEMCG_PROT_SKIP:
  2683				/*
  2684				 * Skip scanning this memcg if the usage of it is
  2685				 * zero.
  2686				 */
  2687				continue;
  2688			}
  2689	
  2690			reclaimed = sc->nr_reclaimed;
  2691			scanned = sc->nr_scanned;
  2692	
  2693			shrink_lruvec(lruvec, sc);
  2694	
  2695			shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
  2696				    sc->priority);
  2697	
  2698			if (memcg->in_low_reclaim)
  2699				memcg->in_low_reclaim = 0;
  2700	
  2701			/* Record the group's reclaim efficiency */
  2702			vmpressure(sc->gfp_mask, memcg, false,
  2703				   sc->nr_scanned - scanned,
  2704				   sc->nr_reclaimed - reclaimed);
  2705	
  2706		} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
  2707	}
  2708	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
kernel test robot Dec. 25, 2019, 1:18 p.m. UTC | #2
Hi Yafang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.5-rc3 next-20191220]
[cannot apply to mmotm/master]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Yafang-Shao/protect-page-cache-from-freeing-inode/20191225-193636
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 46cf053efec6a3a5f343fead837777efe8252a46
config: i386-tinyconfig (attached as .config)
compiler: gcc-7 (Debian 7.5.0-3) 7.5.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from include/linux/swap.h:9:0,
                    from include/linux/suspend.h:5,
                    from arch/x86/kernel/asm-offsets.c:13:
>> include/linux/memcontrol.h:872:23: error: conflicting types for 'memcg'
            struct inode *memcg)
                          ^~~~~
   include/linux/memcontrol.h:871:63: note: previous definition of 'memcg' was here
    static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
                                                                  ^~~~~
   make[2]: *** [arch/x86/kernel/asm-offsets.s] Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [sub-make] Error 2
   9 real  4 user  3 sys  93.40% cpu 	make prepare

vim +/memcg +872 include/linux/memcontrol.h

   870	
   871	static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
 > 872						   struct inode *memcg)
   873	{
   874		return true;
   875	}
   876	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
Yafang Shao Dec. 26, 2019, 5:09 a.m. UTC | #3
On Wed, Dec 25, 2019 at 9:19 PM kbuild test robot <lkp@intel.com> wrote:
>
> Hi Yafang,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on linus/master]
> [also build test ERROR on v5.5-rc3 next-20191220]
> [cannot apply to mmotm/master]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
>
> url:    https://github.com/0day-ci/linux/commits/Yafang-Shao/protect-page-cache-from-freeing-inode/20191225-193636
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 46cf053efec6a3a5f343fead837777efe8252a46
> config: i386-tinyconfig (attached as .config)
> compiler: gcc-7 (Debian 7.5.0-3) 7.5.0
> reproduce:
>         # save the attached .config to linux build tree
>         make ARCH=i386
>
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot <lkp@intel.com>
>
> All errors (new ones prefixed by >>):
>
>    In file included from include/linux/swap.h:9:0,
>                     from include/linux/suspend.h:5,
>                     from arch/x86/kernel/asm-offsets.c:13:
> >> include/linux/memcontrol.h:872:23: error: conflicting types for 'memcg'
>             struct inode *memcg)
>                           ^~~~~
>    include/linux/memcontrol.h:871:63: note: previous definition of 'memcg' was here
>     static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
>                                                                   ^~~~~
>    make[2]: *** [arch/x86/kernel/asm-offsets.s] Error 1
>    make[2]: Target '__build' not remade because of errors.
>    make[1]: *** [prepare0] Error 2
>    make[1]: Target 'prepare' not remade because of errors.
>    make: *** [sub-make] Error 2
>    9 real  4 user  3 sys  93.40% cpu    make prepare
>
> vim +/memcg +872 include/linux/memcontrol.h
>
>    870
>    871  static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
>  > 872                                             struct inode *memcg)
>    873  {
>    874          return true;
>    875  }
>    876
>

Will fix this build error (when CONFIG_MEMCG_KMEM is not set) in next
version, thanks kbuild test robot.

Thanks
Yafang
Dave Chinner Jan. 4, 2020, 3:55 a.m. UTC | #4
On Tue, Dec 24, 2019 at 02:53:26AM -0500, Yafang Shao wrote:
> On my server there're some running MEMCGs protected by memory.{min, low},
> but I found the usage of these MEMCGs abruptly became very small, which
> were far less than the protect limit. It confused me and finally I
> found that was because of inode stealing.
> Once an inode is freed, all its belonging page caches will be dropped as
> well, no matter how may page caches it has. So if we intend to protect the
> page caches in a memcg, we must protect their host (the inode) first.
> Otherwise the memcg protection can be easily bypassed with freeing inode,
> especially if there're big files in this memcg.
> 
> Supposes we have a memcg, and the stat of this memcg is,
> 	memory.current = 1024M
> 	memory.min = 512M
> And in this memcg there's a inode with 800M page caches.
> Once this memcg is scanned by kswapd or other regular reclaimers,
>     kswapd <<<< It can be either of the regular reclaimers.
>         shrink_node_memcgs
> 	    switch (mem_cgroup_protected()) <<<< Not protected
> 		case MEMCG_PROT_NONE:  <<<< Will scan this memcg
> 			beak;
>             shrink_lruvec() <<<< Reclaim the page caches
>             shrink_slab()   <<<< It may free this inode and drop all its
>                                  page caches(800M).
> So we must protect the inode first if we want to protect page caches.
> 
> The inherent mismatch between memcg and inode is a trouble. One inode can
> be shared by different MEMCGs, but it is a very rare case. If an inode is
> shared, its belonging page caches may be charged to different MEMCGs.
> Currently there's no perfect solution to fix this kind of issue, but the
> inode majority-writer ownership switching can help it more or less.

There's multiple changes in this patch set. Yes, there's some inode
cache futzing to deal with memcgs, but it also adds some weird
undocumented "in low reclaim" heuristic that does something magical
with "protection" that you don't describe or document at all. PLease
separate that out into a new patch and document it clearly (both in
the commit message and the code, please).

> diff --git a/fs/inode.c b/fs/inode.c
> index fef457a..4f4b2f3 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -54,6 +54,13 @@
>   *   inode_hash_lock
>   */
>  
> +struct inode_head {

not an "inode head" structure. They are inode isolation arguments.
i.e. struct inode_isolation_args {...};

> +	struct list_head *freeable;
> +#ifdef CONFIG_MEMCG_KMEM
> +	struct mem_cgroup *memcg;
> +#endif
> +};

These defines are awful, too, and completely unnecesarily because
the struct shrink_control unconditionally defines sc->memcg and
so we can just code it throughout without caring whether memcgs are
enabled or not.

> +
>  static unsigned int i_hash_mask __read_mostly;
>  static unsigned int i_hash_shift __read_mostly;
>  static struct hlist_head *inode_hashtable __read_mostly;
> @@ -724,8 +731,10 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
>  static enum lru_status inode_lru_isolate(struct list_head *item,
>  		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
>  {
> -	struct list_head *freeable = arg;
> +	struct inode_head *ihead = (struct inode_head *)arg;

No need for a cast of a void *.

> +	struct list_head *freeable = ihead->freeable;
>  	struct inode	*inode = container_of(item, struct inode, i_lru);
> +	struct mem_cgroup *memcg = NULL;

	struct inode_isolation_args *iargs = arg;
	struct list_head *freeable = iargs->freeable;
	struct mem_cgroup *memcg = iargs->memcg;
	struct inode	*inode = container_of(item, struct inode, i_lru);

> @@ -734,6 +743,15 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>  	if (!spin_trylock(&inode->i_lock))
>  		return LRU_SKIP;
>  
> +#ifdef CONFIG_MEMCG_KMEM
> +	memcg = ihead->memcg;
> +#endif

This is no longer necessary.

> +	if (memcg && inode->i_data.nrpages &&
> +	    !(memcg_can_reclaim_inode(memcg, inode))) {
> +		spin_unlock(&inode->i_lock);
> +		return LRU_ROTATE;
> +	}

I'd argue that both the memcg and the inode->i_data.nrpages check
should be inside memcg_can_reclaim_inode(), that way this entire
chunk of code disappears when CONFIG_MEMCG_KMEM=N.

> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index f36ada9..d1d4175 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -247,6 +247,9 @@ struct mem_cgroup {
>  	unsigned int tcpmem_active : 1;
>  	unsigned int tcpmem_pressure : 1;
>  
> +	/* Soft protection will be ignored if it's true */
> +	unsigned int in_low_reclaim : 1;

This is the stuff that has nothing to do with the changes to the
inode reclaim shrinker...

Cheers,

Dave.
Yafang Shao Jan. 4, 2020, 7:42 a.m. UTC | #5
On Sat, Jan 4, 2020 at 11:55 AM Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Dec 24, 2019 at 02:53:26AM -0500, Yafang Shao wrote:
> > On my server there're some running MEMCGs protected by memory.{min, low},
> > but I found the usage of these MEMCGs abruptly became very small, which
> > were far less than the protect limit. It confused me and finally I
> > found that was because of inode stealing.
> > Once an inode is freed, all its belonging page caches will be dropped as
> > well, no matter how may page caches it has. So if we intend to protect the
> > page caches in a memcg, we must protect their host (the inode) first.
> > Otherwise the memcg protection can be easily bypassed with freeing inode,
> > especially if there're big files in this memcg.
> >
> > Supposes we have a memcg, and the stat of this memcg is,
> >       memory.current = 1024M
> >       memory.min = 512M
> > And in this memcg there's a inode with 800M page caches.
> > Once this memcg is scanned by kswapd or other regular reclaimers,
> >     kswapd <<<< It can be either of the regular reclaimers.
> >         shrink_node_memcgs
> >           switch (mem_cgroup_protected()) <<<< Not protected
> >               case MEMCG_PROT_NONE:  <<<< Will scan this memcg
> >                       beak;
> >             shrink_lruvec() <<<< Reclaim the page caches
> >             shrink_slab()   <<<< It may free this inode and drop all its
> >                                  page caches(800M).
> > So we must protect the inode first if we want to protect page caches.
> >
> > The inherent mismatch between memcg and inode is a trouble. One inode can
> > be shared by different MEMCGs, but it is a very rare case. If an inode is
> > shared, its belonging page caches may be charged to different MEMCGs.
> > Currently there's no perfect solution to fix this kind of issue, but the
> > inode majority-writer ownership switching can help it more or less.
>
> There's multiple changes in this patch set. Yes, there's some inode
> cache futzing to deal with memcgs, but it also adds some weird
> undocumented "in low reclaim" heuristic that does something magical
> with "protection" that you don't describe or document at all. PLease
> separate that out into a new patch and document it clearly (both in
> the commit message and the code, please).
>

Sure, I will separate it and document it in next version.

> > diff --git a/fs/inode.c b/fs/inode.c
> > index fef457a..4f4b2f3 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -54,6 +54,13 @@
> >   *   inode_hash_lock
> >   */
> >
> > +struct inode_head {
>
> not an "inode head" structure. They are inode isolation arguments.
> i.e. struct inode_isolation_args {...};
>

Agree with you that inode_isolation_args is better.
While I have a different idea, what about using inode_isolation_control ?
scan_control : to scan all MEMCGs
        v
shrink_control: to shrink all slabs in one memcg
        v
inode_isolation_control: to shrink one slab (the inode)

And in the future we may introduce dentry_isolation_control and some others.
Anyway that's a minor issue.

> > +     struct list_head *freeable;
> > +#ifdef CONFIG_MEMCG_KMEM
> > +     struct mem_cgroup *memcg;
> > +#endif
> > +};
>
> These defines are awful, too, and completely unnecesarily because
> the struct shrink_control unconditionally defines sc->memcg and
> so we can just code it throughout without caring whether memcgs are
> enabled or not.
>

Sure, will change it in next version.

> > +
> >  static unsigned int i_hash_mask __read_mostly;
> >  static unsigned int i_hash_shift __read_mostly;
> >  static struct hlist_head *inode_hashtable __read_mostly;
> > @@ -724,8 +731,10 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
> >  static enum lru_status inode_lru_isolate(struct list_head *item,
> >               struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
> >  {
> > -     struct list_head *freeable = arg;
> > +     struct inode_head *ihead = (struct inode_head *)arg;
>
> No need for a cast of a void *.
>

OK.

> > +     struct list_head *freeable = ihead->freeable;
> >       struct inode    *inode = container_of(item, struct inode, i_lru);
> > +     struct mem_cgroup *memcg = NULL;
>
>         struct inode_isolation_args *iargs = arg;
>         struct list_head *freeable = iargs->freeable;
>         struct mem_cgroup *memcg = iargs->memcg;
>         struct inode    *inode = container_of(item, struct inode, i_lru);
>

Thanks. That looks better.

> > @@ -734,6 +743,15 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
> >       if (!spin_trylock(&inode->i_lock))
> >               return LRU_SKIP;
> >
> > +#ifdef CONFIG_MEMCG_KMEM
> > +     memcg = ihead->memcg;
> > +#endif
>
> This is no longer necessary.
>

Thanks.

> > +     if (memcg && inode->i_data.nrpages &&
> > +         !(memcg_can_reclaim_inode(memcg, inode))) {
> > +             spin_unlock(&inode->i_lock);
> > +             return LRU_ROTATE;
> > +     }
>
> I'd argue that both the memcg and the inode->i_data.nrpages check
> should be inside memcg_can_reclaim_inode(), that way this entire
> chunk of code disappears when CONFIG_MEMCG_KMEM=N.
>

I will think about it and make the code better when CONFIG_MEMCG_KMEM=N.

> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index f36ada9..d1d4175 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -247,6 +247,9 @@ struct mem_cgroup {
> >       unsigned int tcpmem_active : 1;
> >       unsigned int tcpmem_pressure : 1;
> >
> > +     /* Soft protection will be ignored if it's true */
> > +     unsigned int in_low_reclaim : 1;
>
> This is the stuff that has nothing to do with the changes to the
> inode reclaim shrinker...
>

In next version I will drop this in_low_reclaim and using the
memcg_low_reclaim passed from struct scan_control.


Thanks
Yafang
diff mbox series

Patch

diff --git a/fs/inode.c b/fs/inode.c
index fef457a..4f4b2f3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -54,6 +54,13 @@ 
  *   inode_hash_lock
  */
 
+struct inode_head {
+	struct list_head *freeable;
+#ifdef CONFIG_MEMCG_KMEM
+	struct mem_cgroup *memcg;
+#endif
+};
+
 static unsigned int i_hash_mask __read_mostly;
 static unsigned int i_hash_shift __read_mostly;
 static struct hlist_head *inode_hashtable __read_mostly;
@@ -724,8 +731,10 @@  int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 static enum lru_status inode_lru_isolate(struct list_head *item,
 		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
-	struct list_head *freeable = arg;
+	struct inode_head *ihead = (struct inode_head *)arg;
+	struct list_head *freeable = ihead->freeable;
 	struct inode	*inode = container_of(item, struct inode, i_lru);
+	struct mem_cgroup *memcg = NULL;
 
 	/*
 	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
@@ -734,6 +743,15 @@  static enum lru_status inode_lru_isolate(struct list_head *item,
 	if (!spin_trylock(&inode->i_lock))
 		return LRU_SKIP;
 
+#ifdef CONFIG_MEMCG_KMEM
+	memcg = ihead->memcg;
+#endif
+	if (memcg && inode->i_data.nrpages &&
+	    !(memcg_can_reclaim_inode(memcg, inode))) {
+		spin_unlock(&inode->i_lock);
+		return LRU_ROTATE;
+	}
+
 	/*
 	 * Referenced or dirty inodes are still in use. Give them another pass
 	 * through the LRU as we canot reclaim them now.
@@ -789,11 +807,14 @@  static enum lru_status inode_lru_isolate(struct list_head *item,
  */
 long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
 {
+	struct inode_head ihead;
 	LIST_HEAD(freeable);
 	long freed;
 
+	ihead.freeable = &freeable;
+	ihead.memcg = sc->memcg;
 	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
-				     inode_lru_isolate, &freeable);
+				     inode_lru_isolate, &ihead);
 	dispose_list(&freeable);
 	return freed;
 }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f36ada9..d1d4175 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -247,6 +247,9 @@  struct mem_cgroup {
 	unsigned int tcpmem_active : 1;
 	unsigned int tcpmem_pressure : 1;
 
+	/* Soft protection will be ignored if it's true */
+	unsigned int in_low_reclaim : 1;
+
 	int under_oom;
 
 	int	swappiness;
@@ -363,7 +366,7 @@  static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
 
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 						struct mem_cgroup *memcg);
-
+bool memcg_can_reclaim_inode(struct mem_cgroup *memcg, struct inode *inode);
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
 			  bool compound);
@@ -865,6 +868,12 @@  static inline enum mem_cgroup_protection mem_cgroup_protected(
 	return MEMCG_PROT_NONE;
 }
 
+static inline bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
+					   struct inode *memcg)
+{
+	return true;
+}
+
 static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 					gfp_t gfp_mask,
 					struct mem_cgroup **memcgp,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2fc2bf4..c3498fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6340,6 +6340,49 @@  enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 }
 
 /**
+ * Once an inode is freed, all its belonging page caches will be dropped as
+ * well, even if there're lots of page caches. So if we intend to protect
+ * page caches in a memcg, we must protect their host(the inode) first.
+ * Otherwise the memcg protection can be easily bypassed with freeing inode,
+ * especially if there're big files in this memcg.
+ * Note that it may happen that the page caches are already charged to the
+ * memcg, but the inode hasn't been added to this memcg yet. In this case,
+ * this inode is not protected.
+ * The inherent mismatch between memcg and inode is a trouble. One inode
+ * can be shared by different MEMCGs, but it is a very rare case. If
+ * an inode is shared, its belonging page caches may be charged to
+ * different MEMCGs. Currently there's no perfect solution to fix this
+ * kind of issue, but the inode majority-writer ownership switching can
+ * help it more or less.
+ */
+bool memcg_can_reclaim_inode(struct mem_cgroup *memcg,
+			     struct inode *inode)
+{
+	unsigned long cgroup_size;
+	unsigned long protection;
+	bool reclaimable = true;
+
+	if (memcg == root_mem_cgroup)
+		goto out;
+
+	protection = mem_cgroup_protection(memcg, memcg->in_low_reclaim);
+	if (!protection)
+		goto out;
+
+	/*
+	 * Don't protect this inode if the usage of this memcg is still
+	 * above the protection after reclaiming this inode and all its
+	 * belonging page caches.
+	 */
+	cgroup_size = mem_cgroup_size(memcg);
+	if (inode->i_data.nrpages + protection > cgroup_size)
+		reclaimable = false;
+
+out:
+	return reclaimable;
+}
+
+/**
  * mem_cgroup_try_charge - try charging a page
  * @page: page to charge
  * @mm: mm context of the victim
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3c4c2da..ecc5c1d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2666,6 +2666,8 @@  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 				sc->memcg_low_skipped = 1;
 				continue;
 			}
+
+			memcg->in_low_reclaim = 1;
 			memcg_memory_event(memcg, MEMCG_LOW);
 			break;
 		case MEMCG_PROT_NONE:
@@ -2693,6 +2695,9 @@  static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
 			    sc->priority);
 
+		if (memcg->in_low_reclaim)
+			memcg->in_low_reclaim = 0;
+
 		/* Record the group's reclaim efficiency */
 		vmpressure(sc->gfp_mask, memcg, false,
 			   sc->nr_scanned - scanned,