Message ID | 1425306313-7234-4-git-send-email-daniel.wagner@bmw-carit.de (mailing list archive) |
---|---|
State | New, archived |
On Mon, 2 Mar 2015 15:25:12 +0100 Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:

> The locks_insert/delete_block() functions are used for flock, posix
> and leases types. blocked_lock_lock is used to serialize all access to
> fl_link, fl_block, fl_next and blocked_hash. Here, we prepare the
> stage for using blocked_lock_lock to protect blocked_hash.
>
> Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
> Cc: Jeff Layton <jlayton@poochiereds.net>
> Cc: "J. Bruce Fields" <bfields@fieldses.org>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> ---
>  fs/locks.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 40 insertions(+), 8 deletions(-)
>
> [... earlier hunks snipped; the full diff is reproduced at the bottom
> of this page ...]
>
> +/* Posix block variant of __locks_insert_block.
> + *
> + * Must be called with flc_lock and blocked_lock_lock held.
> + */
> +static void __locks_insert_posix_block(struct file_lock *blocker,
> +					struct file_lock *waiter)
> +{
> +	__locks_insert_block(blocker, waiter);
> +	if (!IS_OFDLCK(blocker))
> +		locks_insert_global_blocked(waiter);
> +}

In many ways OFD locks act more like flock locks than POSIX ones. In
particular, there is no deadlock detection there, so once your
conversion is done to more widely use the percpu locks, you should be
able to avoid taking the blocked_lock_lock for OFD locks as well. The
4th patch in this series doesn't currently do that.

You may want to revisit this patch such that the IS_OFDLCK checks are
done earlier, so that you can avoid taking the blocked_lock_lock if
IS_POSIX and !IS_OFDLCK.

> @@ -675,7 +701,10 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
>  		waiter = list_first_entry(&blocker->fl_block,
>  				struct file_lock, fl_block);
> -		__locks_delete_block(waiter);
> +		if (IS_POSIX(blocker))

...again, you probably want to make this read:

	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))

...so that later you can avoid taking the blocked_lock_lock if
IS_OFDLCK(blocker).

> +			__locks_delete_posix_block(waiter);
> +		else
> +			__locks_delete_block(waiter);
>  		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
>  			waiter->fl_lmops->lm_notify(waiter);
>  		else
>
> [... remaining hunks snipped ...]
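To make the suggestion concrete, here is a rough sketch of the cleanup path Jeff is describing; the wrapper name is hypothetical, not code from the series:

	/* Hypothetical sketch: hoist the OFD check so that only traditional
	 * POSIX waiters pay for the deadlock-detection hash removal. After
	 * the later percpu-lock conversion, the else-branch could then
	 * avoid blocked_lock_lock entirely. */
	static void locks_delete_block_checked(struct file_lock *fl)
	{
		if (IS_POSIX(fl) && !IS_OFDLCK(fl))
			locks_delete_posix_block(fl);	/* global hash + blocked_lock_lock */
		else
			locks_delete_block(fl);		/* no deadlock-detection bookkeeping */
	}

The underlying point is that only traditional POSIX locks ever enter the blocked_hash used for deadlock detection, so only they need the global-lock bookkeeping.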
On 03/03/2015 01:55 AM, Jeff Layton wrote:
> On Mon, 2 Mar 2015 15:25:12 +0100
> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
>
>> The locks_insert/delete_block() functions are used for flock, posix
>> and leases types. blocked_lock_lock is used to serialize all access to
>> fl_link, fl_block, fl_next and blocked_hash. Here, we prepare the
>> stage for using blocked_lock_lock to protect blocked_hash.
>>
>> [... patch snipped ...]
>
> In many ways OFD locks act more like flock locks than POSIX ones. In
> particular, there is no deadlock detection there, so once your
> conversion is done to more widely use the percpu locks, you should be
> able to avoid taking the blocked_lock_lock for OFD locks as well. The
> 4th patch in this series doesn't currently do that.
>
> You may want to revisit this patch such that the IS_OFDLCK checks are
> done earlier, so that you can avoid taking the blocked_lock_lock if
> IS_POSIX and !IS_OFDLCK.

Thanks for the explanation. I was not entirely sure what to do here
and forgot to ask.

I have fixed that stuff and now I am testing it. Though it seems
that there is a memory leak which can be triggered with

    while true; do rm -rf /tmp/a; ./lease02 /tmp/a; done

and this happens also without any of my patches. Still trying to
figure out what's happening.
Hopefully I'm just seeing a ghost. slabtop tells me that
ftrace_event_field is constantly growing:

 Active / Total Objects (% used)    : 968819303 / 968828665 (100.0%)
 Active / Total Slabs (% used)      : 11404623 / 11404623 (100.0%)
 Active / Total Caches (% used)     : 72 / 99 (72.7%)
 Active / Total Size (% used)       : 45616199.68K / 45619608.73K (100.0%)
 Minimum / Average / Maximum Object : 0.01K / 0.05K / 16.00K

      OBJS    ACTIVE  USE OBJ SIZE     SLABS OBJ/SLAB CACHE SIZE NAME
 967510630 967510630   2%    0.05K  11382478       85  45529912K ftrace_event_field
    154368    154368 100%    0.03K      1206      128      4824K kmalloc-32
    121856    121856 100%    0.01K       238      512       952K kmalloc-8
    121227    121095  99%    0.08K      2377       51      9508K Acpi-State

This is on real hardware. On a kvm guest, fasync_cache grows fast and
finally the guest runs out of memory. systemd tries hard to restart
everything and fails constantly:

[  187.021758] systemd invoked oom-killer: gfp_mask=0x200da, order=0, oom_score_adj=0
[  187.022337] systemd cpuset=/ mems_allowed=0
[  187.022662] CPU: 3 PID: 1 Comm: systemd Not tainted 4.0.0-rc1+ #380
[  187.023117] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153950- 04/01/2014
[  187.023801]  ffff88007c918000 ffff88007c9179c8 ffffffff81b4f9be ffffffff8116a9cc
[  187.024373]  0000000000000000 ffff88007c917a88 ffffffff8116a9d1 000000007c917a58
[  187.024940]  ffffffff8224bc98 ffff88007c917a28 0000000000000092 ffffffff81c1b780
[  187.025515] Call Trace:
[  187.025698]  [<ffffffff81b4f9be>] dump_stack+0x4c/0x65
[  187.026083]  [<ffffffff8116a9cc>] ? dump_header.isra.13+0x7c/0x450
[  187.026525]  [<ffffffff8116a9d1>] dump_header.isra.13+0x81/0x450
[  187.026958]  [<ffffffff810a45c6>] ? trace_hardirqs_on_caller+0x16/0x240
[  187.027437]  [<ffffffff810a47fd>] ? trace_hardirqs_on+0xd/0x10
[  187.027859]  [<ffffffff814fe5c4>] ? ___ratelimit+0x84/0x110
[  187.028264]  [<ffffffff8116b378>] oom_kill_process+0x1e8/0x4c0
[  187.028683]  [<ffffffff8105fda5>] ? has_ns_capability_noaudit+0x5/0x170
[  187.029167]  [<ffffffff8116baf4>] __out_of_memory+0x4a4/0x510
[  187.029579]  [<ffffffff8116bd2b>] out_of_memory+0x5b/0x80
[  187.029970]  [<ffffffff81170f2e>] __alloc_pages_nodemask+0xa0e/0xb60
[  187.030434]  [<ffffffff811ad863>] read_swap_cache_async+0xe3/0x180
[  187.030881]  [<ffffffff811ad9ed>] swapin_readahead+0xed/0x190
[  187.031300]  [<ffffffff8119bcae>] handle_mm_fault+0xbbe/0x1180
[  187.031719]  [<ffffffff81046bed>] __do_page_fault+0x1ed/0x4c0
[  187.032138]  [<ffffffff81046ecc>] do_page_fault+0xc/0x10
[  187.032520]  [<ffffffff81b5ddc2>] page_fault+0x22/0x30
[  187.032889] Mem-Info:
[  187.033066] DMA per-cpu:
[  187.033254] CPU    0: hi:    0, btch:   1 usd:   0
[  187.033596] CPU    1: hi:    0, btch:   1 usd:   0
[  187.033941] CPU    2: hi:    0, btch:   1 usd:   0
[  187.034292] CPU    3: hi:    0, btch:   1 usd:   0
[  187.034637] DMA32 per-cpu:
[  187.034837] CPU    0: hi:  186, btch:  31 usd:  51
[  187.035185] CPU    1: hi:  186, btch:  31 usd:   0
[  187.035529] CPU    2: hi:  186, btch:  31 usd:   0
[  187.035873] CPU    3: hi:  186, btch:  31 usd:  32
[  187.036221] active_anon:5 inactive_anon:0 isolated_anon:0
[  187.036221]  active_file:238 inactive_file:194 isolated_file:0
[  187.036221]  unevictable:0 dirty:0 writeback:8 unstable:0
[  187.036221]  free:3361 slab_reclaimable:4651 slab_unreclaimable:493909
[  187.036221]  mapped:347 shmem:0 pagetables:400 bounce:0
[  187.036221]  free_cma:0
[  187.038385] DMA free:7848kB min:44kB low:52kB high:64kB active_anon:4kB inactive_anon:12kB active_file:0kB inactive_file:4kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15992kB managed:15908kB mlocked:0kB dirty:0kB writeback:8kB mapped:4kB shmem:0kB slab_reclaimable:12kB slab_unreclaimable:7880kB kernel_stack:32kB pagetables:36kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:132 all_unreclaimable? yes
[  187.041138] lowmem_reserve[]: 0 1952 1952 1952
[  187.041510] DMA32 free:5596kB min:5628kB low:7032kB high:8440kB active_anon:16kB inactive_anon:0kB active_file:952kB inactive_file:772kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2080640kB managed:2004912kB mlocked:0kB dirty:0kB writeback:24kB mapped:1384kB shmem:0kB slab_reclaimable:18592kB slab_unreclaimable:1967756kB kernel_stack:1968kB pagetables:1564kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:12716 all_unreclaimable? yes
[  187.044442] lowmem_reserve[]: 0 0 0 0
[  187.044756] DMA: 4*4kB (UEM) 2*8kB (UM) 5*16kB (UEM) 2*32kB (UE) 2*64kB (EM) 3*128kB (UEM) 2*256kB (EM) 3*512kB (UEM) 3*1024kB (UEM) 1*2048kB (R) 0*4096kB = 7856kB
[  187.046022] DMA32: 190*4kB (UER) 6*8kB (R) 1*16kB (R) 1*32kB (R) 0*64kB 0*128kB 1*256kB (R) 1*512kB (R) 0*1024kB 0*2048kB 1*4096kB (R) = 5720kB
[  187.047128] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  187.047724] 554 total pagecache pages
[  187.047991] 60 pages in swap cache
[  187.048259] Swap cache stats: add 102769, delete 102709, find 75688/136761
[  187.048748] Free swap  = 1041456kB
[  187.048995] Total swap = 1048572kB
[  187.049250] 524158 pages RAM
[  187.049463] 0 pages HighMem/MovableOnly
[  187.049739] 18953 pages reserved
[  187.049974] [ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name
[  187.050587] [ 1293]     0  1293    10283        1      23       2      131         -1000 systemd-udevd
[  187.051253] [ 1660]     0  1660    12793       57      24       2      134         -1000 auditd
[  187.051872] [ 1681]    81  1681     6637        1      18       2      124          -900 dbus-daemon
[  187.052529] [ 1725]     0  1725    20707        0      42       3      216         -1000 sshd
[  187.053146] [ 2344]     0  2344     3257        0      11       2       49             0 systemd-cgroups
[  187.053820] [ 2345]     0  2345     3257        0      11       2       55             0 systemd-cgroups
[  187.054497] [ 2350]     0  2350     3257        0      11       2       35             0 systemd-cgroups
[  187.055175] [ 2352]     0  2352     3257        0      12       2       37             0 systemd-cgroups
[  187.055846] [ 2354]     0  2354     3257        0      11       2       43             0 systemd-cgroups
[  187.056530] [ 2355]     0  2355     3257        0      11       2       40             0 systemd-cgroups
[  187.057212] [ 2356]     0  2356     3257        0      11       2       44             0 systemd-cgroups
[  187.057886] [ 2362]     0  2362     3257        0      11       3       33             0 systemd-cgroups
[  187.058564] [ 2371]     0  2371     3257        0      11       2       33             0 systemd-cgroups
[  187.059244] [ 2372]     0  2372     3257        0      10       2       44             0 systemd-cgroups
[  187.059917] [ 2373]     0  2373     3257        0      11       2       39             0 systemd-cgroups
[  187.060600] [ 2376]     0  2376     3257        0      11       2       34             0 systemd-cgroups
[  187.061280] [ 2377]     0  2377     3257        0      10       2       43             0 systemd-cgroups
[  187.061942] [ 2378]     0  2378     3257        0      12       3       34             0 systemd-cgroups
[  187.062598] [ 2379]     0  2379    27502        0      10       3       33             0 agetty
[  187.063200] [ 2385]     0  2385     3257        0      12       2       44             0 systemd-cgroups
[  187.063859] [ 2390]     0  2390     3257        0      11       2       43             0 systemd-cgroups
[  187.064520] [ 2394]     0  2394     3257        0      11       2       41             0 systemd-cgroups
[  187.065182] [ 2397]     0  2397     3257        0      11       2       43             0 systemd-cgroups
[  187.065833] [ 2402]     0  2402     3257        0      11       2       42             0 systemd-cgroups
[  187.066490] [ 2403]     0  2403     3257        0      11       2       44             0 systemd-cgroups
[  187.067148] [ 2404]     0  2404    27502        0      13       3       30             0 agetty
[  187.067743] [ 2410]     0  2410     3257        0      11       2       43             0 systemd-cgroups
[  187.068407] [ 2413]     0  2413     3257        0      11       2       36             0 systemd-cgroups
[  187.069072] [ 2416]     0  2416     3257        0      11       2       49             0 systemd-cgroups
[  187.069720] [ 2417]     0  2417    11861      173      26       2      334             0 (journald)
[  187.070368] Out of memory: Kill process 2417 ((journald)) score 0 or sacrifice child
[  187.070943] Killed process 2417 ((journald)) total-vm:47444kB, anon-rss:0kB, file-rss:692kB
[  187.513857] systemd[1]: Unit systemd-logind.service entered failed state.
[  188.262477] systemd[1]: Unit systemd-journald.service entered failed state.
[  188.315222] systemd[1]: systemd-logind.service holdoff time over, scheduling restart.
[  188.334194] systemd[1]: Stopping Login Service...
[  188.341556] systemd[1]: Starting Login Service...
[  188.408787] systemd[1]: systemd-journald.service holdoff time over, scheduling restart.
[  189.284506] systemd[1]: Stopping Journal Service...
[  189.330806] systemd[1]: Starting Journal Service...
[  189.384800] systemd[1]: Started Journal Service.
cheers,
daniel
On 03/04/2015 04:20 PM, Daniel Wagner wrote:
> On 03/03/2015 01:55 AM, Jeff Layton wrote:
>> On Mon, 2 Mar 2015 15:25:12 +0100
>> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
>>
<>
> I have fixed that stuff and now I am testing it. Though it seems
> that there is a memory leak which can be triggered with
>
>     while true; do rm -rf /tmp/a; ./lease02 /tmp/a; done
>
> and this happens also without any of my patches. Still trying to
> figure out what's happening. Hopefully I'm just seeing a ghost.
>
> slabtop tells me that ftrace_event_field is constantly growing:

Check out the kernel's leak detector; it is perfect for showing you
the exact call stack of the leaked memory. (Tell me if you need the
exact Kconfig key to enable it; it should not be hard to find.)

Cheers
Boaz
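The detector Boaz is referring to is presumably kmemleak. A minimal usage sketch, assuming a kernel built with CONFIG_DEBUG_KMEMLEAK=y and debugfs available:

    mount -t debugfs nodev /sys/kernel/debug   # if debugfs is not already mounted
    echo scan > /sys/kernel/debug/kmemleak     # force an immediate scan
    cat /sys/kernel/debug/kmemleak             # list suspected leaks with call stacks

kmemleak also scans periodically on its own; the report includes the allocation backtrace for each unreferenced object, which is what makes it useful for a leak like this one.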
On 03/04/2015 04:00 PM, Boaz Harrosh wrote:
> On 03/04/2015 04:20 PM, Daniel Wagner wrote:
>> On 03/03/2015 01:55 AM, Jeff Layton wrote:
>>> On Mon, 2 Mar 2015 15:25:12 +0100
>>> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
>>>
> <>
>> I have fixed that stuff and now I am testing it. Though it seems
>> that there is a memory leak which can be triggered with
>>
>>     while true; do rm -rf /tmp/a; ./lease02 /tmp/a; done
>>
>> and this happens also without any of my patches. Still trying to
>> figure out what's happening. Hopefully I'm just seeing a ghost.
>>
>> slabtop tells me that ftrace_event_field is constantly growing:
>
> Check out the kernel's leak detector; it is perfect for showing you
> the exact call stack of the leaked memory.

Thanks for the tip. Will use it in the future :)

I have done a quick bisect, limiting the search to fs/locks.c.
I suspect that the file_lock_context refactoring is the source of the
leak. bisect agrees with me:

8634b51f6ca298fb8b07aa4847340764903533ab is the first bad commit
commit 8634b51f6ca298fb8b07aa4847340764903533ab
Author: Jeff Layton <jlayton@primarydata.com>
Date:   Fri Jan 16 15:05:55 2015 -0500

    locks: convert lease handling to file_lock_context

    Signed-off-by: Jeff Layton <jlayton@primarydata.com>
    Acked-by: Christoph Hellwig <hch@lst.de>

:040000 040000 4114db9392dc4dadb30664b71a954321e5e87bab 5b9abbaf1808a7c926c09fa2164044e0cc26fd54 M	fs
:040000 040000 bd569f527a195edf673c4f7d0e80bf356c7f8d1b 6362646e04dd83efc1a9e92877900797ac879e9a M	include
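For reference, a path-limited bisect of this kind can be driven as sketched below. The v3.19 baseline is an assumption (the last release before the file_lock_context work landed), not something stated in the thread:

    git bisect start -- fs/locks.c    # only visit commits that touch fs/locks.c
    git bisect bad                    # the currently checked-out tree leaks
    git bisect good v3.19             # assumed-good baseline
    # then at each step: build, boot, run the lease02 loop, and mark the
    # revision with "git bisect good" or "git bisect bad"

Restricting the bisect to a pathspec keeps the number of kernels to build small when the suspect code is confined to one file, as it was here.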
On Wed, 4 Mar 2015 16:32:57 +0100 Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:

> On 03/04/2015 04:00 PM, Boaz Harrosh wrote:
> > [... quoted thread snipped ...]
>
> Thanks for the tip. Will use it in the future :)
>
> I have done a quick bisect, limiting the search to fs/locks.c.
> I suspect that the file_lock_context refactoring is the source of the
> leak. bisect agrees with me:
>
> 8634b51f6ca298fb8b07aa4847340764903533ab is the first bad commit
> commit 8634b51f6ca298fb8b07aa4847340764903533ab
> Author: Jeff Layton <jlayton@primarydata.com>
> Date:   Fri Jan 16 15:05:55 2015 -0500
>
>     locks: convert lease handling to file_lock_context
>
>     Signed-off-by: Jeff Layton <jlayton@primarydata.com>
>     Acked-by: Christoph Hellwig <hch@lst.de>

Thanks. I'll take a look.
On Wed, 4 Mar 2015 12:59:23 -0500 Jeff Layton <jlayton@poochiereds.net> wrote:

> On Wed, 4 Mar 2015 16:32:57 +0100
> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
>
> > [... quoted thread snipped ...]
> >
> > 8634b51f6ca298fb8b07aa4847340764903533ab is the first bad commit
> > commit 8634b51f6ca298fb8b07aa4847340764903533ab
> > Author: Jeff Layton <jlayton@primarydata.com>
> > Date:   Fri Jan 16 15:05:55 2015 -0500
> >
> >     locks: convert lease handling to file_lock_context
>
> Thanks. I'll take a look.

Huh. I was a bit surprised by this as I didn't really touch how the
fasync entries get handled. I added a bit of printk debugging
(primitive, I know...) and I see this:

    [  458.715319] lease_modify: calling fasync_helper on ffff880035a942d0

So, fasync_helper is getting called on the fasync entry, but the entry
is definitely not getting freed. When I look at the object in the
debugger, it looks like call_rcu has been called on it though:

    fa_file = 0x0,
    fa_rcu = {
      next = 0xffff8800ccd6a8a0,
      func = 0xffffffff8122b1c0 <fasync_free_rcu>
    }

...it's almost like the RCU grace period isn't ending properly? I'll
keep poking at it to see if I can figure out what's going wrong.
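For context, the RCU callback named in that dump is how fasync entries are normally freed. Roughly (paraphrasing the fs/fcntl.c of that era, so treat details as an approximation):

	/* Approximation of the fasync free path in fs/fcntl.c: the entry is
	 * handed to call_rcu(), and the callback returns it to the slab
	 * cache once a grace period has elapsed. If grace periods stall,
	 * entries pile up exactly as in the slabtop output above. */
	static void fasync_free_rcu(struct rcu_head *head)
	{
		kmem_cache_free(fasync_cache,
				container_of(head, struct fasync_struct, fa_rcu));
	}

	static void fasync_free(struct fasync_struct *fa)
	{
		call_rcu(&fa->fa_rcu, fasync_free_rcu);
	}

So a populated fa_rcu.func with the object still resident is consistent with Jeff's reading: the callback was queued but never ran.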
On Wed, 4 Mar 2015 15:20:33 +0100 Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:

> On 03/03/2015 01:55 AM, Jeff Layton wrote:
> > On Mon, 2 Mar 2015 15:25:12 +0100
> > Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
> >
> > [... quoted patch and review comments snipped ...]
> >
> > You may want to revisit this patch such that the IS_OFDLCK checks are
> > done earlier, so that you can avoid taking the blocked_lock_lock if
> > IS_POSIX and !IS_OFDLCK.
>
> Thanks for the explanation. I was not entirely sure what to do here
> and forgot to ask.
>
> I have fixed that stuff and now I am testing it. Though it seems
> that there is a memory leak which can be triggered with
>
>     while true; do rm -rf /tmp/a; ./lease02 /tmp/a; done
>
> and this happens also without any of my patches. Still trying to
> figure out what's happening. Hopefully I'm just seeing a ghost.
>
> slabtop tells me that ftrace_event_field is constantly growing:
>
> [... slabtop output and OOM log snipped; see Daniel's mail above ...]
>
> cheers,
> daniel

I pulled down the most recent Fedora rawhide kernel today:

    4.0.0-0.rc2.git0.1.fc23.x86_64

...and with that, I can't reproduce this. The ftrace_event_field slab
(which is shared by the fasync_struct cache) seems to stay under
control. I see it hover around 3-4M in size while the test is running,
but the box isn't falling over or anything.

Perhaps this was an MM or RCU bug that is now fixed? Can you confirm
whether you're still able to reproduce it with the most recent mainline
kernels?
On Wed, 4 Mar 2015 16:01:36 -0500 Jeff Layton <jlayton@poochiereds.net> wrote:

> On Wed, 4 Mar 2015 15:20:33 +0100
> Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
>
> > [... quoted thread, slabtop output and OOM log snipped ...]
>
> I pulled down the most recent Fedora rawhide kernel today:
>
>     4.0.0-0.rc2.git0.1.fc23.x86_64
>
> ...and with that, I can't reproduce this. The ftrace_event_field slab
> (which is shared by the fasync_struct cache) seems to stay under
> control. I see it hover around 3-4M in size while the test is running,
> but the box isn't falling over or anything.
>
> Perhaps this was an MM or RCU bug that is now fixed? Can you confirm
> whether you're still able to reproduce it with the most recent
> mainline kernels?

Oh! I take it back. I was testing with your patched lease02 test. With
the unpatched one I can definitely still reproduce it. That suggests
that this problem is occurring when we go to clean up leases when
closing the file. I'll have a close look at that code.
On 03/04/2015 10:01 PM, Jeff Layton wrote:
> I pulled down the most recent Fedora rawhide kernel today:
>
>     4.0.0-0.rc2.git0.1.fc23.x86_64
>
> ...and with that, I can't reproduce this. The ftrace_event_field slab
> (which is shared by the fasync_struct cache) seems to stay under
> control. I see it hover around 3-4M in size while the test is running,
> but the box isn't falling over or anything.
>
> Perhaps this was an MM or RCU bug that is now fixed? Can you confirm
> whether you're still able to reproduce it with the most recent
> mainline kernels?

Sure, I'll give it a spin tomorrow.
diff --git a/fs/locks.c b/fs/locks.c
index 4498da0..02821dd 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -611,11 +611,20 @@ static void locks_delete_global_blocked(struct file_lock *waiter)
  */
 static void __locks_delete_block(struct file_lock *waiter)
 {
-	locks_delete_global_blocked(waiter);
 	list_del_init(&waiter->fl_block);
 	waiter->fl_next = NULL;
 }
 
+/* Posix block variant of __locks_delete_block.
+ *
+ * Must be called with blocked_lock_lock held.
+ */
+static void __locks_delete_posix_block(struct file_lock *waiter)
+{
+	locks_delete_global_blocked(waiter);
+	__locks_delete_block(waiter);
+}
+
 static void locks_delete_block(struct file_lock *waiter)
 {
 	spin_lock(&blocked_lock_lock);
@@ -623,6 +632,13 @@ static void locks_delete_block(struct file_lock *waiter)
 	spin_unlock(&blocked_lock_lock);
 }
 
+static void locks_delete_posix_block(struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_delete_posix_block(waiter);
+	spin_unlock(&blocked_lock_lock);
+}
+
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
  * the order they blocked. The documentation doesn't require this but
@@ -639,7 +655,17 @@ static void __locks_insert_block(struct file_lock *blocker,
 	BUG_ON(!list_empty(&waiter->fl_block));
 	waiter->fl_next = blocker;
 	list_add_tail(&waiter->fl_block, &blocker->fl_block);
-	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
+}
+
+/* Posix block variant of __locks_insert_block.
+ *
+ * Must be called with flc_lock and blocked_lock_lock held.
+ */
+static void __locks_insert_posix_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	__locks_insert_block(blocker, waiter);
+	if (!IS_OFDLCK(blocker))
 		locks_insert_global_blocked(waiter);
 }
 
@@ -675,7 +701,10 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 		waiter = list_first_entry(&blocker->fl_block,
 				struct file_lock, fl_block);
-		__locks_delete_block(waiter);
+		if (IS_POSIX(blocker))
+			__locks_delete_posix_block(waiter);
+		else
+			__locks_delete_block(waiter);
 		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
 			waiter->fl_lmops->lm_notify(waiter);
 		else
@@ -985,7 +1014,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			spin_lock(&blocked_lock_lock);
 			if (likely(!posix_locks_deadlock(request, fl))) {
 				error = FILE_LOCK_DEFERRED;
-				__locks_insert_block(fl, request);
+				__locks_insert_posix_block(fl, request);
 			}
 			spin_unlock(&blocked_lock_lock);
 			goto out;
@@ -1186,7 +1215,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 		if (!error)
 			continue;
 
-		locks_delete_block(fl);
+		locks_delete_posix_block(fl);
 		break;
 	}
 	return error;
@@ -1283,7 +1312,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 			continue;
 		}
 
-		locks_delete_block(&fl);
+		locks_delete_posix_block(&fl);
 		break;
 	}
 
@@ -2103,7 +2132,10 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 		if (!error)
 			continue;
 
-		locks_delete_block(fl);
+		if (IS_POSIX(fl))
+			locks_delete_posix_block(fl);
+		else
+			locks_delete_block(fl);
 		break;
 	}
 
@@ -2467,7 +2499,7 @@ posix_unblock_lock(struct file_lock *waiter)
 
 	spin_lock(&blocked_lock_lock);
 	if (waiter->fl_next)
-		__locks_delete_block(waiter);
+		__locks_delete_posix_block(waiter);
 	else
 		status = -ENOENT;
 	spin_unlock(&blocked_lock_lock);
The locks_insert/delete_block() functions are used for flock, posix
and leases types. blocked_lock_lock is used to serialize all access to
fl_link, fl_block, fl_next and blocked_hash. Here, we prepare the
stage for using blocked_lock_lock to protect blocked_hash.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 fs/locks.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)
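As a reading aid (a sketch assembled from the hunks above, not new code in the patch), the lock ordering the new helpers assume is: the per-inode flc_lock first, then the global blocked_lock_lock, as in the __posix_lock_file() path:

	/* Sketch only: mirrors the ordering visible in __posix_lock_file().
	 * ctx->flc_lock is the per-inode lock; blocked_lock_lock is the
	 * global lock protecting fl_block, fl_next and blocked_hash. */
	spin_lock(&ctx->flc_lock);
	spin_lock(&blocked_lock_lock);
	if (likely(!posix_locks_deadlock(request, fl))) {
		error = FILE_LOCK_DEFERRED;
		__locks_insert_posix_block(fl, request);  /* needs both locks held */
	}
	spin_unlock(&blocked_lock_lock);
	spin_unlock(&ctx->flc_lock);

This is why the comment on __locks_insert_posix_block() requires both flc_lock and blocked_lock_lock to be held by the caller.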