| Field | Value |
|---|---|
| Message ID | 20161223144738.GB23117@dhcp22.suse.cz (mailing list archive) |
| State | Not Applicable |
| Headers | show |
On Fri, Dec 23, 2016 at 03:47:39PM +0100, Michal Hocko wrote: > > Nils, even though this is still highly experimental, could you give it a > try please? Yes, no problem! So I kept the very first patch you sent but had to revert the latest version of the debugging patch (the one in which you added the "mm_vmscan_inactive_list_is_low" event) because otherwise the patch you just sent wouldn't apply. Then I rebooted with memory cgroups enabled again, and the first thing that strikes the eye is that I get this during boot: [ 1.568174] ------------[ cut here ]------------ [ 1.568327] WARNING: CPU: 0 PID: 1 at mm/memcontrol.c:1032 mem_cgroup_update_lru_size+0x118/0x130 [ 1.568543] mem_cgroup_update_lru_size(f4406400, 2, 1): lru_size 0 but not empty [ 1.568754] Modules linked in: [ 1.568922] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.9.0-gentoo #6 [ 1.569052] Hardware name: Hewlett-Packard Compaq 15 Notebook PC/21F7, BIOS F.22 08/06/2014 [ 1.571750] f44e5b84 c142bdee f44e5bc8 c1b5ade0 f44e5bb4 c103ab1d c1b583e4 f44e5be4 [ 1.572262] 00000001 c1b5ade0 00000408 c11603d8 00000408 00000000 c1b5af73 00000001 [ 1.572774] f44e5bd0 c103ab76 00000009 00000000 f44e5bc8 c1b583e4 f44e5be4 f44e5c18 [ 1.573285] Call Trace: [ 1.573419] [<c142bdee>] dump_stack+0x47/0x69 [ 1.573551] [<c103ab1d>] __warn+0xed/0x110 [ 1.573681] [<c11603d8>] ? mem_cgroup_update_lru_size+0x118/0x130 [ 1.573812] [<c103ab76>] warn_slowpath_fmt+0x36/0x40 [ 1.573942] [<c11603d8>] mem_cgroup_update_lru_size+0x118/0x130 [ 1.574076] [<c1111467>] __pagevec_lru_add_fn+0xd7/0x1b0 [ 1.574206] [<c1111390>] ? perf_trace_mm_lru_insertion+0x150/0x150 [ 1.574336] [<c111239d>] pagevec_lru_move_fn+0x4d/0x80 [ 1.574465] [<c1111390>] ? 
perf_trace_mm_lru_insertion+0x150/0x150 [ 1.574595] [<c11127e5>] __lru_cache_add+0x45/0x60 [ 1.574724] [<c1112848>] lru_cache_add+0x8/0x10 [ 1.574852] [<c1102fc1>] add_to_page_cache_lru+0x61/0xc0 [ 1.574982] [<c110418e>] pagecache_get_page+0xee/0x270 [ 1.575111] [<c11060f0>] grab_cache_page_write_begin+0x20/0x40 [ 1.575243] [<c118b955>] simple_write_begin+0x25/0xd0 [ 1.575372] [<c11061b8>] generic_perform_write+0xa8/0x1a0 [ 1.575503] [<c1106447>] __generic_file_write_iter+0x197/0x1f0 [ 1.575634] [<c110663f>] generic_file_write_iter+0x19f/0x2b0 [ 1.575766] [<c11669c1>] __vfs_write+0xd1/0x140 [ 1.575897] [<c1166bc5>] vfs_write+0x95/0x1b0 [ 1.576026] [<c1166daf>] SyS_write+0x3f/0x90 [ 1.576157] [<c1ce4474>] xwrite+0x1c/0x4b [ 1.576285] [<c1ce44c5>] do_copy+0x22/0xac [ 1.576413] [<c1ce42c3>] write_buffer+0x1d/0x2c [ 1.576540] [<c1ce42f0>] flush_buffer+0x1e/0x70 [ 1.576670] [<c1d0eae8>] unxz+0x149/0x211 [ 1.576798] [<c1d0e99f>] ? unlzo+0x359/0x359 [ 1.576926] [<c1ce4946>] unpack_to_rootfs+0x14f/0x246 [ 1.577054] [<c1ce42d2>] ? write_buffer+0x2c/0x2c [ 1.577183] [<c1ce4216>] ? initrd_load+0x3b/0x3b [ 1.577312] [<c1ce4b20>] ? maybe_link.part.3+0xe3/0xe3 [ 1.577443] [<c1ce4b67>] populate_rootfs+0x47/0x8f [ 1.577573] [<c1000456>] do_one_initcall+0x36/0x150 [ 1.577701] [<c1ce351e>] ? repair_env_string+0x12/0x54 [ 1.577832] [<c1054ded>] ? parse_args+0x25d/0x400 [ 1.577962] [<c1ce3baf>] ? kernel_init_freeable+0x101/0x19e [ 1.578092] [<c1ce3bcf>] kernel_init_freeable+0x121/0x19e [ 1.578222] [<c19b0700>] ? rest_init+0x60/0x60 [ 1.578350] [<c19b070b>] kernel_init+0xb/0x100 [ 1.578480] [<c1060c7c>] ? schedule_tail+0xc/0x50 [ 1.578608] [<c19b0700>] ? rest_init+0x60/0x60 [ 1.578737] [<c19b5db7>] ret_from_fork+0x1b/0x28 [ 1.578871] ---[ end trace cf6f1adac9dfe60e ]--- The machine then continued to boot just normally, however, so I started my ordinary tests. And in fact, they were working just fine, i.e. no OOMing anymore, even during heavy tarball unpacking. 
Would it make sense to capture more trace data for you at this point? As I'm on the go, I don't currently have a second machine for capturing over the network, but since we're not having OOMs or other issues now, capturing to file should probably work just fine. I'll keep the patch applied and see if I notice anything else that doesn't look normal during day to day usage, especially during my ordinary Gentoo updates, which consist of a lot of fetching / unpacking / building, and in the recent past had been very problematic (in fact, that was where the problem first struck me and the "heavy tarball unpacking" test was then just what I distilled it down to in order to manually reproduce this with the least time and effort possible). Greetings Nils -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon 26-12-16 06:25:56, kernel test robot wrote: > > FYI, we noticed the following commit: > > commit: d18e2b2aca0396849f588241e134787a829c707d ("mm, memcg: fix (Re: OOM: Better, but still there on)") > url: https://github.com/0day-ci/linux/commits/Michal-Hocko/mm-memcg-fix-Re-OOM-Better-but-still-there-on/20161223-225057 > base: git://git.cmpxchg.org/linux-mmotm.git master > > in testcase: boot > > on test machine: qemu-system-i386 -enable-kvm -m 360M > > caused below changes: > > > +--------------------------------------------------------+------------+------------+ > | | c7d85b880b | d18e2b2aca | > +--------------------------------------------------------+------------+------------+ > | boot_successes | 8 | 0 | > | boot_failures | 0 | 2 | > | WARNING:at_mm/memcontrol.c:#mem_cgroup_update_lru_size | 0 | 2 | > | kernel_BUG_at_mm/memcontrol.c | 0 | 2 | > | invalid_opcode:#[##]DEBUG_PAGEALLOC | 0 | 2 | > | Kernel_panic-not_syncing:Fatal_exception | 0 | 2 | > +--------------------------------------------------------+------------+------------+ > > > > [ 95.226364] init: tty6 main process (990) killed by TERM signal > [ 95.314020] init: plymouth-upstart-bridge main process (1039) terminated with status 1 > [ 97.588568] ------------[ cut here ]------------ > [ 97.594364] WARNING: CPU: 0 PID: 1055 at mm/memcontrol.c:1032 mem_cgroup_update_lru_size+0xdd/0x12b > [ 97.606654] mem_cgroup_update_lru_size(40297f00, 0, -1): lru_size 1 but empty > [ 97.615140] Modules linked in: > [ 97.618834] CPU: 0 PID: 1055 Comm: killall5 Not tainted 4.9.0-mm1-00095-gd18e2b2 #82 > [ 97.628008] Call Trace: > [ 97.631025] dump_stack+0x16/0x18 > [ 97.635107] __warn+0xaf/0xc6 > [ 97.638729] ? mem_cgroup_update_lru_size+0xdd/0x12b Do you have the full backtrace?
On Mon 26-12-16 13:26:51, Michal Hocko wrote: > On Mon 26-12-16 06:25:56, kernel test robot wrote: [...] > > [ 95.226364] init: tty6 main process (990) killed by TERM signal > > [ 95.314020] init: plymouth-upstart-bridge main process (1039) terminated with status 1 > > [ 97.588568] ------------[ cut here ]------------ > > [ 97.594364] WARNING: CPU: 0 PID: 1055 at mm/memcontrol.c:1032 mem_cgroup_update_lru_size+0xdd/0x12b > > [ 97.606654] mem_cgroup_update_lru_size(40297f00, 0, -1): lru_size 1 but empty > > [ 97.615140] Modules linked in: > > [ 97.618834] CPU: 0 PID: 1055 Comm: killall5 Not tainted 4.9.0-mm1-00095-gd18e2b2 #82 > > [ 97.628008] Call Trace: > > [ 97.631025] dump_stack+0x16/0x18 > > [ 97.635107] __warn+0xaf/0xc6 > > [ 97.638729] ? mem_cgroup_update_lru_size+0xdd/0x12b > > Do you have the full backtrace? It's not needed. I found the bug in my patch and it should be fixed by the updated patch http://lkml.kernel.org/r/20161226124839.GB20715@dhcp22.suse.cz
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 61d20c17f3b7..002cb08b0f3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { */ struct mem_cgroup_per_node { struct lruvec lruvec; - unsigned long lru_size[NR_LRU_LISTS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages); + int zid, int nr_pages); unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); @@ -441,9 +441,23 @@ static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_node *mz; + unsigned long nr_pages = 0; + int zid; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return mz->lru_size[lru]; + for (zid = 0; zid < MAX_NR_ZONES; zid++) + nr_pages += mz->lru_zone_size[zid][lru]; + return nr_pages; +} + +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + struct mem_cgroup_per_node *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->lru_zone_size[zone_idx][lru]; } void mem_cgroup_handle_over_high(void); @@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { return 0; } +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + return 0; +} static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 71613e8a720f..41d376e7116d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -39,7 
+39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 91dfc7c5ce8f..f4e9c4d49df3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); unsigned long nr = 0; - struct mem_cgroup_per_node *mz; enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); @@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - mz = mem_cgroup_nodeinfo(memcg, nid); - nr += mz->lru_size[lru]; + nr += mem_cgroup_get_lru_size(lruvec, lru); } return nr; } @@ -1002,6 +1001,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on + * @zid: zone id of the accounted pages * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added @@ -1009,7 +1009,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd * so as to allow it to check that lru_size 0 is consistent with list_empty). 
*/ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages) + int zid, int nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; @@ -1020,7 +1020,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, return; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - lru_size = mz->lru_size + lru; + lru_size = &mz->lru_zone_size[zid][lru]; empty = list_empty(lruvec->lists + lru); if (nr_pages < 0) @@ -1036,6 +1036,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, if (nr_pages > 0) *lru_size += nr_pages; + mz->lru_zone_size[zid][lru] += nr_pages; } bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) diff --git a/mm/vmscan.c b/mm/vmscan.c index c4abf08861d2..c98b1a585992 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -242,6 +242,15 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } +unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + + return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], + NR_ZONE_LRU_BASE + lru); +} + /* * Add a shrinker callback to be called from the vm. */ @@ -1382,8 +1391,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * be complete before mem_cgroup_update_lru_size due to a santity check. 
*/ static __always_inline void update_lru_sizes(struct lruvec *lruvec, - enum lru_list lru, unsigned long *nr_zone_taken, - unsigned long nr_taken) + enum lru_list lru, unsigned long *nr_zone_taken) { int zid; @@ -1392,11 +1400,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); - } - #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); + mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); #endif + } + } /* @@ -1501,7 +1509,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); - update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); + update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -2047,10 +2055,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!managed_zone(zone)) continue; - inactive_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE)); - active_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); + active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); inactive -= min(inactive, inactive_zone); active -= min(active, active_zone);