Message ID | 20191030025906.14395-2-lsahlber@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | cifs: move cifsFileInfo_put logic into a work-queue | expand |
вт, 29 окт. 2019 г. в 19:59, Ronnie Sahlberg <lsahlber@redhat.com>: > > This patch moves the final part of the cifsFileInfo_put() logic where we > need a write lock on lock_sem to be processed in a separate thread that > holds no other locks. > This is to prevent deadlocks like the one below: > > > there are 6 processes looping to while trying to down_write > > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > > cifs_new_fileinfo > > > > and there are 5 other processes which are blocked, several of them > > waiting on either PG_writeback or PG_locked (which are both set), all > > for the same page of the file > > > > 2 inode_lock() (inode->i_rwsem) for the file > > 1 wait_on_page_writeback() for the page > > 1 down_read(inode->i_rwsem) for the inode of the directory > > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > > 1 __lock_page > > > > > > so processes are blocked waiting on: > > page flags PG_locked and PG_writeback for one specific page > > inode->i_rwsem for the directory > > inode->i_rwsem for the file > > cifsInodeInflock_sem > > > > > > > > here are the more gory details (let me know if I need to provide > > anything more/better): > > > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > > COMMAND: "reopen_file" > > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > > * (I think legitimize_path is bogus) > > > > in path_openat > > } else { > > const char *s = path_init(nd, flags); > > while (!(error = link_path_walk(s, nd)) && > > (error = do_last(nd, file, op)) > 0) { <<<< > > > > do_last: > > if (open_flag & O_CREAT) > > inode_lock(dir->d_inode); <<<< > > else > > so it's trying to take inode->i_rwsem for the directory > > > > DENTRY INODE SUPERBLK TYPE PATH > > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > > inode.i_rwsem is ffff8c691158efc0 > > > > <struct rw_semaphore 0xffff8c691158efc0>: > > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > > reopen_file), counter: 0x0000000000000003 > > waitlist: 2 > > 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926 > > RWSEM_WAITING_FOR_WRITE > > 0xffff996500393e00 9802 ls UN 0 1:17:26.700 > > RWSEM_WAITING_FOR_READ > > > > > > the owner of the inode.i_rwsem of the directory is: > > > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > > COMMAND: "reopen_file" > > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > > #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd > > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs] > > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs] > > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > > index = 0x8 > > mapping = 0xffff8c68f3cd0db0 > > flags = 0xfffffc0008095 > > > > PAGE-FLAG BIT VALUE > > PG_locked 0 0000001 > > PG_uptodate 2 0000004 > > PG_lru 4 0000010 > > PG_waiters 7 0000080 > > PG_writeback 15 0008000 > > > > > > inode is ffff8c68f3cd0c40 > > inode.i_rwsem is ffff8c68f3cd0ce0 > > DENTRY INODE SUPERBLK TYPE PATH > > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > > /mnt/vm1_smb/testfile.8853 > > > > > > this process holds the inode->i_rwsem for the parent directory, is > > laundering a page attached to the inode of the file it's opening, and in > > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > > for the file itself. > > > > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > > reopen_file), counter: 0x0000000000000003 > > waitlist: 1 > > 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912 > > RWSEM_WAITING_FOR_WRITE > > > > this is the inode.i_rwsem for the file > > > > the owner: > > > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > > COMMAND: "reopen_file" > > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs] > > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > > > the process holds the inode->i_rwsem for the file to which it's writing, > > and is trying to __lock_page for the same page as in the other processes > > > > > > the other tasks: > > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > > COMMAND: "reopen_file" > > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs] > > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > > > this is opening the file, and is trying to down_write cinode->lock_sem > > > > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > > COMMAND: "reopen_file" > > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > > COMMAND: "reopen_file" > > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > > COMMAND: "reopen_file" > > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > > COMMAND: "reopen_file" > > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > > > closing the file, and trying to down_write cifsi->lock_sem > > > > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > > COMMAND: "reopen_file" > > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > > > in __filemap_fdatawait_range > > wait_on_page_writeback(page); > > for the same page of the file > > > > > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > > COMMAND: "reopen_file" > > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > > > inode_lock(inode); > > > > > > and one 'ls' later on, to see whether the rest of the mount is available > > (the test file is in the root, so we get blocked up on the directory > > ->i_rwsem), so the entire mount is unavailable > > > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > > COMMAND: "ls" > > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > > > in iterate_dir: > > if (shared) > > res = down_read_killable(&inode->i_rwsem); <<<< > > else > > res = down_write_killable(&inode->i_rwsem); > > > > Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> > --- > fs/cifs/cifsfs.c | 13 +++++++- > fs/cifs/cifsglob.h | 3 +- > fs/cifs/file.c | 91 ++++++++++++++++++++++++++++++++++-------------------- > 3 files changed, 72 insertions(+), 35 deletions(-) > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > index e4e3b573d20c..f8e201c45ccb 100644 > --- a/fs/cifs/cifsfs.c > +++ b/fs/cifs/cifsfs.c > @@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp; > > struct workqueue_struct *cifsiod_wq; > struct workqueue_struct *decrypt_wq; > +struct workqueue_struct *fileinfo_put_wq; > struct workqueue_struct *cifsoplockd_wq; > __u32 cifs_lock_secret; > > @@ -1557,11 +1558,18 @@ init_cifs(void) > goto out_destroy_cifsiod_wq; > } > > + fileinfo_put_wq = alloc_workqueue("cifsfileinfoput", > + WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); > + if (!fileinfo_put_wq) { > + rc = -ENOMEM; > + goto out_destroy_decrypt_wq; > + } > + > cifsoplockd_wq = alloc_workqueue("cifsoplockd", > WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); > if (!cifsoplockd_wq) { > rc = -ENOMEM; > - goto out_destroy_decrypt_wq; > + goto out_destroy_fileinfo_put_wq; > } > > rc = cifs_fscache_register(); > @@ -1627,6 +1635,8 @@ init_cifs(void) > cifs_fscache_unregister(); > out_destroy_cifsoplockd_wq: > destroy_workqueue(cifsoplockd_wq); > +out_destroy_fileinfo_put_wq: > + destroy_workqueue(fileinfo_put_wq); > out_destroy_decrypt_wq: > destroy_workqueue(decrypt_wq); > out_destroy_cifsiod_wq: > @@ -1656,6 +1666,7 @@ exit_cifs(void) > cifs_fscache_unregister(); > destroy_workqueue(cifsoplockd_wq); > destroy_workqueue(decrypt_wq); > + destroy_workqueue(fileinfo_put_wq); > destroy_workqueue(cifsiod_wq); > cifs_proc_clean(); > } > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h > index d78bfcc19156..0819b6498a3c 100644 > --- a/fs/cifs/cifsglob.h > +++ b/fs/cifs/cifsglob.h > @@ -1265,6 +1265,7 @@ struct cifsFileInfo { > struct mutex fh_mutex; /* prevents reopen race after dead ses*/ > struct cifs_search_info srch_inf; > struct work_struct oplock_break; /* work for oplock breaks */ > + struct work_struct put; /* work for the final part of _put */ > }; > > struct cifs_io_parms { > @@ -1370,7 +1371,6 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) > } > > struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); > void cifsFileInfo_put(struct cifsFileInfo *cifs_file); > > #define CIFS_CACHE_READ_FLG 1 > @@ -1907,6 +1907,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile); > extern const struct slow_work_ops cifs_oplock_break_ops; > extern struct workqueue_struct *cifsiod_wq; > extern struct workqueue_struct *decrypt_wq; > +extern struct workqueue_struct *fileinfo_put_wq; > extern struct workqueue_struct *cifsoplockd_wq; > extern __u32 cifs_lock_secret; > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c > index 67e7d0ffe385..d1dedb8fb31f 100644 > --- a/fs/cifs/file.c > +++ b/fs/cifs/file.c > @@ -375,30 +375,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) > return cifs_file; > } > > -/** > - * cifsFileInfo_put - release a reference of file priv data > - * > - * Always potentially wait for oplock handler. See _cifsFileInfo_put(). > - */ > -void cifsFileInfo_put(struct cifsFileInfo *cifs_file) > -{ > - _cifsFileInfo_put(cifs_file, true); > -} > - > -/** > - * _cifsFileInfo_put - release a reference of file priv data > - * > - * This may involve closing the filehandle @cifs_file out on the > - * server. Must be called without holding tcon->open_file_lock and > - * cifs_file->file_info_lock. > - * > - * If @wait_for_oplock_handler is true and we are releasing the last > - * reference, wait for any running oplock break handler of the file > - * and cancel any pending one. If calling this function from the > - * oplock break handler, you need to pass false. > - * > - */ > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > +void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) > { > struct inode *inode = d_inode(cifs_file->dentry); > struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); > @@ -409,7 +386,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > struct cifsLockInfo *li, *tmp; > struct cifs_fid fid; > struct cifs_pending_open open; > - bool oplock_break_cancelled; > > spin_lock(&tcon->open_file_lock); > spin_lock(&cifsi->open_file_lock); > @@ -449,9 +425,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > spin_unlock(&cifsi->open_file_lock); > spin_unlock(&tcon->open_file_lock); > > - oplock_break_cancelled = wait_oplock_handler ? > - cancel_work_sync(&cifs_file->oplock_break) : false; > - > if (!tcon->need_reconnect && !cifs_file->invalidHandle) { > struct TCP_Server_Info *server = tcon->ses->server; > unsigned int xid; > @@ -462,9 +435,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > _free_xid(xid); > } > > - if (oplock_break_cancelled) > - cifs_done_oplock_break(cifsi); > - > cifs_del_pending_open(&open); > > /* > @@ -487,6 +457,61 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > kfree(cifs_file); > } > > +static void cifsFileInfo_put_work(struct work_struct *work) > +{ > + struct cifsFileInfo *cifs_file = container_of(work, > + struct cifsFileInfo, put); > + > + cifsFileInfo_put_final(cifs_file); > +} > + > +/** > + * _cifsFileInfo_put - release a reference of file priv data > + * > + * This may involve closing the filehandle @cifs_file out on the > + * server. Must be called without holding tcon->open_file_lock, > + * cinode->open_file_lock and cifs_file->file_info_lock. > + * > + * If @wait_for_oplock_handler is true and we are releasing the last > + * reference, wait for any running oplock break handler of the file > + * and cancel any pending one. If calling this function from the > + * oplock break handler, you need to pass false. > + * > + */ > +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, > + bool wait_oplock_handler, bool offload) > +{ > + struct inode *inode = d_inode(cifs_file->dentry); > + struct cifsInodeInfo *cifsi = CIFS_I(inode); > + > + spin_lock(&cifs_file->file_info_lock); > + if (cifs_file->count > 1) { > + cifs_file->count--; > + spin_unlock(&cifs_file->file_info_lock); > + return; > + } > + spin_unlock(&cifs_file->file_info_lock); > + > + if (wait_oplock_handler && cancel_work_sync(&cifs_file->oplock_break)) > + cifs_done_oplock_break(cifsi); At this point the file info is still in the lists and can be referenced by other threads not only including oplock break routine. So, we can't cancel oplock break here - it would lead to big timeouts to other clients trying to open the file because the server will be waiting for an oplock break ack. Let's do it in cifsFileInfo_put_final() as it was originally in cifsFileInfo_put() and we do not need an extra *wait_oplock_handler* argument - just cancel it unconditionally. If a caller of this function don't want to deadlock on himself (like cifs_oplock_break) it will just specify *offload* as True. > + > + if (offload) { > + INIT_WORK(&cifs_file->put, cifsFileInfo_put_work); > + queue_work(fileinfo_put_wq, &cifs_file->put); > + } else > + cifsFileInfo_put_final(cifs_file); > +} > + > +/** > + * cifsFileInfo_put - release a reference of file priv data > + * > + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). > + */ > +void cifsFileInfo_put(struct cifsFileInfo *cifs_file) > +{ > + _cifsFileInfo_put(cifs_file, true, true); > +} > + > int cifs_open(struct inode *inode, struct file *file) > > { > @@ -808,7 +833,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) > int cifs_close(struct inode *inode, struct file *file) > { > if (file->private_data != NULL) { > - cifsFileInfo_put(file->private_data); > + _cifsFileInfo_put(file->private_data, true, false); > file->private_data = NULL; > } > > @@ -4742,7 +4767,7 @@ void cifs_oplock_break(struct work_struct *work) > cinode); > cifs_dbg(FYI, "Oplock release rc = %d\n", rc); > } > - _cifsFileInfo_put(cfile, false /* do not wait for ourself */); > + _cifsFileInfo_put(cfile, false /* do not wait for ourself */, true); As mentioned above, here we only need *offload* as True to not wait for ourselves. > cifs_done_oplock_break(cinode); > } > > -- > 2.13.6 > Other than that, I think we the patch is on the right track. It should give us an opportunity to do file handle batching easily. -- Best regards, Pavel Shilovsky
ср, 30 окт. 2019 г. в 08:46, Pavel Shilovsky <piastryyy@gmail.com>: > > вт, 29 окт. 2019 г. в 19:59, Ronnie Sahlberg <lsahlber@redhat.com>: > > > > This patch moves the final part of the cifsFileInfo_put() logic where we > > need a write lock on lock_sem to be processed in a separate thread that > > holds no other locks. > > This is to prevent deadlocks like the one below: > > > > > there are 6 processes looping to while trying to down_write > > > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > > > cifs_new_fileinfo > > > > > > and there are 5 other processes which are blocked, several of them > > > waiting on either PG_writeback or PG_locked (which are both set), all > > > for the same page of the file > > > > > > 2 inode_lock() (inode->i_rwsem) for the file > > > 1 wait_on_page_writeback() for the page > > > 1 down_read(inode->i_rwsem) for the inode of the directory > > > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > > > 1 __lock_page > > > > > > > > > so processes are blocked waiting on: > > > page flags PG_locked and PG_writeback for one specific page > > > inode->i_rwsem for the directory > > > inode->i_rwsem for the file > > > cifsInodeInflock_sem > > > > > > > > > > > > here are the more gory details (let me know if I need to provide > > > anything more/better): > > > > > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > > > COMMAND: "reopen_file" > > > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > > > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > > > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > > > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > > > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > > > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > > > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > > > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > > > * (I think legitimize_path is bogus) > > > > > > in path_openat > > > } else { > > > const char *s = path_init(nd, flags); > > > while (!(error = link_path_walk(s, nd)) && > > > (error = do_last(nd, file, op)) > 0) { <<<< > > > > > > do_last: > > > if (open_flag & O_CREAT) > > > inode_lock(dir->d_inode); <<<< > > > else > > > so it's trying to take inode->i_rwsem for the directory > > > > > > DENTRY INODE SUPERBLK TYPE PATH > > > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > > > inode.i_rwsem is ffff8c691158efc0 > > > > > > <struct rw_semaphore 0xffff8c691158efc0>: > > > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > > > reopen_file), counter: 0x0000000000000003 > > > waitlist: 2 > > > 0xffff9965007e3c90 8863 reopen_file UN 0 1:29:22.926 > > > RWSEM_WAITING_FOR_WRITE > > > 0xffff996500393e00 9802 ls UN 0 1:17:26.700 > > > RWSEM_WAITING_FOR_READ > > > > > > > > > the owner of the inode.i_rwsem of the directory is: > > > > > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > > > COMMAND: "reopen_file" > > > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > > > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > > > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > > > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > > > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > > > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > > > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > > > #7 [ffff99650065bb30] invalidate_inode_pages2_range at ffffffff9b04d4bd > > > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 [cifs] > > > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a [cifs] > > > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > > > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > > > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > > > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > > > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > > > > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > > > > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > > > index = 0x8 > > > mapping = 0xffff8c68f3cd0db0 > > > flags = 0xfffffc0008095 > > > > > > PAGE-FLAG BIT VALUE > > > PG_locked 0 0000001 > > > PG_uptodate 2 0000004 > > > PG_lru 4 0000010 > > > PG_waiters 7 0000080 > > > PG_writeback 15 0008000 > > > > > > > > > inode is ffff8c68f3cd0c40 > > > inode.i_rwsem is ffff8c68f3cd0ce0 > > > DENTRY INODE SUPERBLK TYPE PATH > > > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > > > /mnt/vm1_smb/testfile.8853 > > > > > > > > > this process holds the inode->i_rwsem for the parent directory, is > > > laundering a page attached to the inode of the file it's opening, and in > > > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > > > for the file itself. > > > > > > > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > > > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > > > reopen_file), counter: 0x0000000000000003 > > > waitlist: 1 > > > 0xffff9965005dfd80 8855 reopen_file UN 0 1:29:22.912 > > > RWSEM_WAITING_FOR_WRITE > > > > > > this is the inode.i_rwsem for the file > > > > > > the owner: > > > > > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > > > COMMAND: "reopen_file" > > > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > > > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > > > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > > > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > > > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > > > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > > > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > > > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > > > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > > > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b [cifs] > > > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > > > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > > > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > > > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > > > > > the process holds the inode->i_rwsem for the file to which it's writing, > > > and is trying to __lock_page for the same page as in the other processes > > > > > > > > > the other tasks: > > > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > > > COMMAND: "reopen_file" > > > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > > > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > > > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > > > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > > > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 [cifs] > > > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > > > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > > > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > > > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > > > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > > > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > > > > > this is opening the file, and is trying to down_write cinode->lock_sem > > > > > > > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > > > COMMAND: "reopen_file" > > > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > > > COMMAND: "reopen_file" > > > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > > > COMMAND: "reopen_file" > > > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > > > COMMAND: "reopen_file" > > > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > > > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > > > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > > > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > > > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 [cifs] > > > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > > > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > > > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > > > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > > > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > > > > > closing the file, and trying to down_write cifsi->lock_sem > > > > > > > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > > > COMMAND: "reopen_file" > > > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > > > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > > > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > > > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > > > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > > > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > > > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > > > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > > > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > > > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > > > > > in __filemap_fdatawait_range > > > wait_on_page_writeback(page); > > > for the same page of the file > > > > > > > > > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > > > COMMAND: "reopen_file" > > > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > > > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > > > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > > > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > > > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > > > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > > > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > > > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > > > > > inode_lock(inode); > > > > > > > > > and one 'ls' later on, to see whether the rest of the mount is available > > > (the test file is in the root, so we get blocked up on the directory > > > ->i_rwsem), so the entire mount is unavailable > > > > > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > > > COMMAND: "ls" > > > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > > > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > > > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > > > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > > > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > > > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > > > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > > > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > > > > > in iterate_dir: > > > if (shared) > > > res = down_read_killable(&inode->i_rwsem); <<<< > > > else > > > res = down_write_killable(&inode->i_rwsem); > > > > > > > Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> > > --- > > fs/cifs/cifsfs.c | 13 +++++++- > > fs/cifs/cifsglob.h | 3 +- > > fs/cifs/file.c | 91 ++++++++++++++++++++++++++++++++++-------------------- > > 3 files changed, 72 insertions(+), 35 deletions(-) > > > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > > index e4e3b573d20c..f8e201c45ccb 100644 > > --- a/fs/cifs/cifsfs.c > > +++ b/fs/cifs/cifsfs.c > > @@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp; > > > > struct workqueue_struct *cifsiod_wq; > > struct workqueue_struct *decrypt_wq; > > +struct workqueue_struct *fileinfo_put_wq; > > struct workqueue_struct *cifsoplockd_wq; > > __u32 cifs_lock_secret; > > > > @@ -1557,11 +1558,18 @@ init_cifs(void) > > goto out_destroy_cifsiod_wq; > > } > > > > + fileinfo_put_wq = alloc_workqueue("cifsfileinfoput", > > + WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); > > + if (!fileinfo_put_wq) { > > + rc = -ENOMEM; > > + goto out_destroy_decrypt_wq; > > + } > > + > > cifsoplockd_wq = alloc_workqueue("cifsoplockd", > > WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); > > if (!cifsoplockd_wq) { > > rc = -ENOMEM; > > - goto out_destroy_decrypt_wq; > > + goto out_destroy_fileinfo_put_wq; > > } > > > > rc = cifs_fscache_register(); > > @@ -1627,6 +1635,8 @@ init_cifs(void) > > cifs_fscache_unregister(); > > out_destroy_cifsoplockd_wq: > > destroy_workqueue(cifsoplockd_wq); > > +out_destroy_fileinfo_put_wq: > > + destroy_workqueue(fileinfo_put_wq); > > out_destroy_decrypt_wq: > > destroy_workqueue(decrypt_wq); > > out_destroy_cifsiod_wq: > > @@ -1656,6 +1666,7 @@ exit_cifs(void) > > cifs_fscache_unregister(); > > destroy_workqueue(cifsoplockd_wq); > > destroy_workqueue(decrypt_wq); > > + destroy_workqueue(fileinfo_put_wq); > > destroy_workqueue(cifsiod_wq); > > cifs_proc_clean(); > > } > > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h > > index d78bfcc19156..0819b6498a3c 100644 > > --- a/fs/cifs/cifsglob.h > > +++ b/fs/cifs/cifsglob.h > > @@ -1265,6 +1265,7 @@ struct cifsFileInfo { > > struct mutex fh_mutex; /* prevents reopen race after dead ses*/ > > struct cifs_search_info srch_inf; > > struct work_struct oplock_break; /* work for oplock breaks */ > > + struct work_struct put; /* work for the final part of _put */ > > }; > > > > struct cifs_io_parms { > > @@ -1370,7 +1371,6 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) > > } > > > > struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); > > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); > > void cifsFileInfo_put(struct cifsFileInfo *cifs_file); > > > > #define CIFS_CACHE_READ_FLG 1 > > @@ -1907,6 +1907,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile); > > extern const struct slow_work_ops cifs_oplock_break_ops; > > extern struct workqueue_struct *cifsiod_wq; > > extern struct workqueue_struct *decrypt_wq; > > +extern struct workqueue_struct *fileinfo_put_wq; > > extern struct workqueue_struct *cifsoplockd_wq; > > extern __u32 cifs_lock_secret; > > > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c > > index 67e7d0ffe385..d1dedb8fb31f 100644 > > --- a/fs/cifs/file.c > > +++ b/fs/cifs/file.c > > @@ -375,30 +375,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) > > return cifs_file; > > } > > > > -/** > > - * cifsFileInfo_put - release a reference of file priv data > > - * > > - * Always potentially wait for oplock handler. See _cifsFileInfo_put(). > > - */ > > -void cifsFileInfo_put(struct cifsFileInfo *cifs_file) > > -{ > > - _cifsFileInfo_put(cifs_file, true); > > -} > > - > > -/** > > - * _cifsFileInfo_put - release a reference of file priv data > > - * > > - * This may involve closing the filehandle @cifs_file out on the > > - * server. Must be called without holding tcon->open_file_lock and > > - * cifs_file->file_info_lock. > > - * > > - * If @wait_for_oplock_handler is true and we are releasing the last > > - * reference, wait for any running oplock break handler of the file > > - * and cancel any pending one. If calling this function from the > > - * oplock break handler, you need to pass false. > > - * > > - */ > > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > > +void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) > > { > > struct inode *inode = d_inode(cifs_file->dentry); > > struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); > > @@ -409,7 +386,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > > struct cifsLockInfo *li, *tmp; > > struct cifs_fid fid; > > struct cifs_pending_open open; > > - bool oplock_break_cancelled; > > > > spin_lock(&tcon->open_file_lock); > > spin_lock(&cifsi->open_file_lock); > > @@ -449,9 +425,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > > spin_unlock(&cifsi->open_file_lock); > > spin_unlock(&tcon->open_file_lock); > > > > - oplock_break_cancelled = wait_oplock_handler ? > > - cancel_work_sync(&cifs_file->oplock_break) : false; > > - > > if (!tcon->need_reconnect && !cifs_file->invalidHandle) { > > struct TCP_Server_Info *server = tcon->ses->server; > > unsigned int xid; > > @@ -462,9 +435,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > > _free_xid(xid); > > } > > > > - if (oplock_break_cancelled) > > - cifs_done_oplock_break(cifsi); > > - > > cifs_del_pending_open(&open); > > > > /* > > @@ -487,6 +457,61 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) > > kfree(cifs_file); > > } > > > > +static void cifsFileInfo_put_work(struct work_struct *work) > > +{ > > + struct cifsFileInfo *cifs_file = container_of(work, > > + struct cifsFileInfo, put); > > + > > + cifsFileInfo_put_final(cifs_file); > > +} > > + > > +/** > > + * _cifsFileInfo_put - release a reference of file priv data > > + * > > + * This may involve closing the filehandle @cifs_file out on the > > + * server. Must be called without holding tcon->open_file_lock, > > + * cinode->open_file_lock and cifs_file->file_info_lock. > > + * > > + * If @wait_for_oplock_handler is true and we are releasing the last > > + * reference, wait for any running oplock break handler of the file > > + * and cancel any pending one. If calling this function from the > > + * oplock break handler, you need to pass false. > > + * > > + */ > > +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, > > + bool wait_oplock_handler, bool offload) > > +{ > > + struct inode *inode = d_inode(cifs_file->dentry); > > + struct cifsInodeInfo *cifsi = CIFS_I(inode); > > + > > + spin_lock(&cifs_file->file_info_lock); > > + if (cifs_file->count > 1) { > > + cifs_file->count--; > > + spin_unlock(&cifs_file->file_info_lock); > > + return; > > + } > > + spin_unlock(&cifs_file->file_info_lock); > > + > > + if (wait_oplock_handler && cancel_work_sync(&cifs_file->oplock_break)) > > + cifs_done_oplock_break(cifsi); > > At this point the file info is still in the lists and can be > referenced by other threads not only including oplock break routine. > So, we can't cancel oplock break here - it would lead to big timeouts > to other clients trying to open the file because the server will be > waiting for an oplock break ack. > > Let's do it in cifsFileInfo_put_final() as it was originally in > cifsFileInfo_put() and we do not need an extra *wait_oplock_handler* > argument - just cancel it unconditionally. If a caller of this > function don't want to deadlock on himself (like cifs_oplock_break) it > will just specify *offload* as True. > > > + > > + if (offload) { > > + INIT_WORK(&cifs_file->put, cifsFileInfo_put_work); INIT_WORK should be done when initializing cifsFileInfo structure. > > + queue_work(fileinfo_put_wq, &cifs_file->put); As I mentioned in the parallel discussion queue_work here may return False if the same work has been already queued. It may happen in the following scenario: T1 gets reference - 1 T1 is in putting the reference - checked ref > 1 T2 gets reference - 2 T1 queues work T3 (worker) puts the reference - 1 but still didn't exit and remains in the queue T2 is in putting the reference - checked ref >1 T2 queues work and it returns false (!) <--- error Sorry, it wasn't obvious to me when I was suggesting this yesterday. One way to fix this is to check queue_work output and "goto" to the beginning of the function if False. But there is always a probability that we end up in the same situation and it repeats and repeats again. I think it should resolve itself at some point in time but it doesn't look like a good approach. Another way to fix it is to revert the patch to the state it was at yesterday - to offload only the remaining part of cifsFileInfo_put() which will allow us to avoid the race from above. I am personally leaning towards this way. Other may comments about eliminating the need of *wait_oplock_handler* and INIT_WORK stand for yesterday's patch too. Thoughts? > > + } else > > + cifsFileInfo_put_final(cifs_file); > > +} > > + > > +/** > > + * cifsFileInfo_put - release a reference of file priv data > > + * > > + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). > > + */ > > +void cifsFileInfo_put(struct cifsFileInfo *cifs_file) > > +{ > > + _cifsFileInfo_put(cifs_file, true, true); > > +} > > + > > int cifs_open(struct inode *inode, struct file *file) > > > > { > > @@ -808,7 +833,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) > > int cifs_close(struct inode *inode, struct file *file) > > { > > if (file->private_data != NULL) { > > - cifsFileInfo_put(file->private_data); > > + _cifsFileInfo_put(file->private_data, true, false); > > file->private_data = NULL; > > } > > > > @@ -4742,7 +4767,7 @@ void cifs_oplock_break(struct work_struct *work) > > cinode); > > cifs_dbg(FYI, "Oplock release rc = %d\n", rc); > > } > > - _cifsFileInfo_put(cfile, false /* do not wait for ourself */); > > + _cifsFileInfo_put(cfile, false /* do not wait for ourself */, true); > > As mentioned above, here we only need *offload* as True to not wait > for ourselves. > > > cifs_done_oplock_break(cinode); > > } > > > > -- > > 2.13.6 > > > > Other than that, I think we the patch is on the right track. It should > give us an opportunity to do file handle batching easily. > > -- > Best regards, > Pavel Shilovsky
----- Original Message ----- > From: "Pavel Shilovsky" <piastryyy@gmail.com> > To: "Ronnie Sahlberg" <lsahlber@redhat.com> > Cc: "linux-cifs" <linux-cifs@vger.kernel.org> > Sent: Thursday, 31 October, 2019 2:56:11 AM > Subject: Re: [PATCH] cifs: move cifsFileInfo_put logic into a work-queue > > ср, 30 окт. 2019 г. в 08:46, Pavel Shilovsky <piastryyy@gmail.com>: > > > > вт, 29 окт. 2019 г. в 19:59, Ronnie Sahlberg <lsahlber@redhat.com>: > > > > > > This patch moves the final part of the cifsFileInfo_put() logic where we > > > need a write lock on lock_sem to be processed in a separate thread that > > > holds no other locks. > > > This is to prevent deadlocks like the one below: > > > > > > > there are 6 processes looping to while trying to down_write > > > > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > > > > cifs_new_fileinfo > > > > > > > > and there are 5 other processes which are blocked, several of them > > > > waiting on either PG_writeback or PG_locked (which are both set), all > > > > for the same page of the file > > > > > > > > 2 inode_lock() (inode->i_rwsem) for the file > > > > 1 wait_on_page_writeback() for the page > > > > 1 down_read(inode->i_rwsem) for the inode of the directory > > > > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > > > > 1 __lock_page > > > > > > > > > > > > so processes are blocked waiting on: > > > > page flags PG_locked and PG_writeback for one specific page > > > > inode->i_rwsem for the directory > > > > inode->i_rwsem for the file > > > > cifsInodeInflock_sem > > > > > > > > > > > > > > > > here are the more gory details (let me know if I need to provide > > > > anything more/better): > > > > > > > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > > > > COMMAND: "reopen_file" > > > > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > > > > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > > > > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > > > > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > > > > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > > > > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > > > > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > > > > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > > > > * (I think legitimize_path is bogus) > > > > > > > > in path_openat > > > > } else { > > > > const char *s = path_init(nd, flags); > > > > while (!(error = link_path_walk(s, nd)) && > > > > (error = do_last(nd, file, op)) > 0) { <<<< > > > > > > > > do_last: > > > > if (open_flag & O_CREAT) > > > > inode_lock(dir->d_inode); <<<< > > > > else > > > > so it's trying to take inode->i_rwsem for the directory > > > > > > > > DENTRY INODE SUPERBLK TYPE PATH > > > > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > > > > inode.i_rwsem is ffff8c691158efc0 > > > > > > > > <struct rw_semaphore 0xffff8c691158efc0>: > > > > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > > > > reopen_file), counter: 0x0000000000000003 > > > > waitlist: 2 > > > > 0xffff9965007e3c90 8863 reopen_file UN 0 > > > > 1:29:22.926 > > > > RWSEM_WAITING_FOR_WRITE > > > > 0xffff996500393e00 9802 ls UN 0 > > > > 1:17:26.700 > > > > RWSEM_WAITING_FOR_READ > > > > > > > > > > > > the owner of the inode.i_rwsem of the directory is: > > > > > > > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > > > > COMMAND: "reopen_file" > > > > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > > > > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > > > > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > > > > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > > > > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 > > > > [cifs] > > > > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > > > > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > > > > #7 [ffff99650065bb30] invalidate_inode_pages2_range at > > > > ffffffff9b04d4bd > > > > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 > > > > [cifs] > > > > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a > > > > [cifs] > > > > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > > > > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > > > > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > > > > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > > > > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > > > > > > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > > > > index = 0x8 > > > > mapping = 0xffff8c68f3cd0db0 > > > > flags = 0xfffffc0008095 > > > > > > > > PAGE-FLAG BIT VALUE > > > > PG_locked 0 0000001 > > > > PG_uptodate 2 0000004 > > > > PG_lru 4 0000010 > > > > PG_waiters 7 0000080 > > > > PG_writeback 15 0008000 > > > > > > > > > > > > inode is ffff8c68f3cd0c40 > > > > inode.i_rwsem is ffff8c68f3cd0ce0 > > > > DENTRY INODE SUPERBLK TYPE PATH > > > > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > > > > /mnt/vm1_smb/testfile.8853 > > > > > > > > > > > > this process holds the inode->i_rwsem for the parent directory, is > > > > laundering a page attached to the inode of the file it's opening, and > > > > in > > > > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > > > > for the file itself. > > > > > > > > > > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > > > > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > > > > reopen_file), counter: 0x0000000000000003 > > > > waitlist: 1 > > > > 0xffff9965005dfd80 8855 reopen_file UN 0 > > > > 1:29:22.912 > > > > RWSEM_WAITING_FOR_WRITE > > > > > > > > this is the inode.i_rwsem for the file > > > > > > > > the owner: > > > > > > > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > > > > COMMAND: "reopen_file" > > > > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > > > > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > > > > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > > > > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > > > > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > > > > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > > > > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > > > > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > > > > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > > > > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b > > > > [cifs] > > > > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > > > > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > > > > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > > > > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > the process holds the inode->i_rwsem for the file to which it's > > > > writing, > > > > and is trying to __lock_page for the same page as in the other > > > > processes > > > > > > > > > > > > the other tasks: > > > > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > > > > COMMAND: "reopen_file" > > > > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > > > > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > > > > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > > > > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > > > > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 > > > > [cifs] > > > > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > > > > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > > > > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > > > > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > > > > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > > > > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > this is opening the file, and is trying to down_write cinode->lock_sem > > > > > > > > > > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > > > > COMMAND: "reopen_file" > > > > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > > > > COMMAND: "reopen_file" > > > > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > > > > COMMAND: "reopen_file" > > > > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > > > > COMMAND: "reopen_file" > > > > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > > > > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > > > > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > > > > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > > > > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 > > > > [cifs] > > > > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > > > > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > > > > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > > > > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > > > > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > > > > > > > closing the file, and trying to down_write cifsi->lock_sem > > > > > > > > > > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > > > > COMMAND: "reopen_file" > > > > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > > > > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > > > > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > > > > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > > > > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > > > > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > > > > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > > > > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > > > > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > > > > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > in __filemap_fdatawait_range > > > > wait_on_page_writeback(page); > > > > for the same page of the file > > > > > > > > > > > > > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > > > > COMMAND: "reopen_file" > > > > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > > > > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > > > > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > > > > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > > > > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > > > > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > > > > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > > > > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > inode_lock(inode); > > > > > > > > > > > > and one 'ls' later on, to see whether the rest of the mount is > > > > available > > > > (the test file is in the root, so we get blocked up on the directory > > > > ->i_rwsem), so the entire mount is unavailable > > > > > > > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > > > > COMMAND: "ls" > > > > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > > > > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > > > > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > > > > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > > > > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > > > > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > > > > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > > > > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > in iterate_dir: > > > > if (shared) > > > > res = down_read_killable(&inode->i_rwsem); <<<< > > > > else > > > > res = down_write_killable(&inode->i_rwsem); > > > > > > > > > > Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> > > > --- > > > fs/cifs/cifsfs.c | 13 +++++++- > > > fs/cifs/cifsglob.h | 3 +- > > > fs/cifs/file.c | 91 > > > ++++++++++++++++++++++++++++++++++-------------------- > > > 3 files changed, 72 insertions(+), 35 deletions(-) > > > > > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > > > index e4e3b573d20c..f8e201c45ccb 100644 > > > --- a/fs/cifs/cifsfs.c > > > +++ b/fs/cifs/cifsfs.c > > > @@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp; > > > > > > struct workqueue_struct *cifsiod_wq; > > > struct workqueue_struct *decrypt_wq; > > > +struct workqueue_struct *fileinfo_put_wq; > > > struct workqueue_struct *cifsoplockd_wq; > > > __u32 cifs_lock_secret; > > > > > > @@ -1557,11 +1558,18 @@ init_cifs(void) > > > goto out_destroy_cifsiod_wq; > > > } > > > > > > + fileinfo_put_wq = alloc_workqueue("cifsfileinfoput", > > > + > > > WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, > > > 0); > > > + if (!fileinfo_put_wq) { > > > + rc = -ENOMEM; > > > + goto out_destroy_decrypt_wq; > > > + } > > > + > > > cifsoplockd_wq = alloc_workqueue("cifsoplockd", > > > WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); > > > if (!cifsoplockd_wq) { > > > rc = -ENOMEM; > > > - goto out_destroy_decrypt_wq; > > > + goto out_destroy_fileinfo_put_wq; > > > } > > > > > > rc = cifs_fscache_register(); > > > @@ -1627,6 +1635,8 @@ init_cifs(void) > > > cifs_fscache_unregister(); > > > out_destroy_cifsoplockd_wq: > > > destroy_workqueue(cifsoplockd_wq); > > > +out_destroy_fileinfo_put_wq: > > > + destroy_workqueue(fileinfo_put_wq); > > > out_destroy_decrypt_wq: > > > destroy_workqueue(decrypt_wq); > > > out_destroy_cifsiod_wq: > > > @@ -1656,6 +1666,7 @@ exit_cifs(void) > > > cifs_fscache_unregister(); > > > destroy_workqueue(cifsoplockd_wq); > > > destroy_workqueue(decrypt_wq); > > > + destroy_workqueue(fileinfo_put_wq); > > > destroy_workqueue(cifsiod_wq); > > > cifs_proc_clean(); > > > } > > > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h > > > index d78bfcc19156..0819b6498a3c 100644 > > > --- a/fs/cifs/cifsglob.h > > > +++ b/fs/cifs/cifsglob.h > > > @@ -1265,6 +1265,7 @@ struct cifsFileInfo { > > > struct mutex fh_mutex; /* prevents reopen race after dead ses*/ > > > struct cifs_search_info srch_inf; > > > struct work_struct oplock_break; /* work for oplock breaks */ > > > + struct work_struct put; /* work for the final part of _put */ > > > }; > > > > > > struct cifs_io_parms { > > > @@ -1370,7 +1371,6 @@ cifsFileInfo_get_locked(struct cifsFileInfo > > > *cifs_file) > > > } > > > > > > struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); > > > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool > > > wait_oplock_hdlr); > > > void cifsFileInfo_put(struct cifsFileInfo *cifs_file); > > > > > > #define CIFS_CACHE_READ_FLG 1 > > > @@ -1907,6 +1907,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo > > > *cfile); > > > extern const struct slow_work_ops cifs_oplock_break_ops; > > > extern struct workqueue_struct *cifsiod_wq; > > > extern struct workqueue_struct *decrypt_wq; > > > +extern struct workqueue_struct *fileinfo_put_wq; > > > extern struct workqueue_struct *cifsoplockd_wq; > > > extern __u32 cifs_lock_secret; > > > > > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c > > > index 67e7d0ffe385..d1dedb8fb31f 100644 > > > --- a/fs/cifs/file.c > > > +++ b/fs/cifs/file.c > > > @@ -375,30 +375,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) > > > return cifs_file; > > > } > > > > > > -/** > > > - * cifsFileInfo_put - release a reference of file priv data > > > - * > > > - * Always potentially wait for oplock handler. See _cifsFileInfo_put(). > > > - */ > > > -void cifsFileInfo_put(struct cifsFileInfo *cifs_file) > > > -{ > > > - _cifsFileInfo_put(cifs_file, true); > > > -} > > > - > > > -/** > > > - * _cifsFileInfo_put - release a reference of file priv data > > > - * > > > - * This may involve closing the filehandle @cifs_file out on the > > > - * server. Must be called without holding tcon->open_file_lock and > > > - * cifs_file->file_info_lock. > > > - * > > > - * If @wait_for_oplock_handler is true and we are releasing the last > > > - * reference, wait for any running oplock break handler of the file > > > - * and cancel any pending one. If calling this function from the > > > - * oplock break handler, you need to pass false. > > > - * > > > - */ > > > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool > > > wait_oplock_handler) > > > +void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) > > > { > > > struct inode *inode = d_inode(cifs_file->dentry); > > > struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); > > > @@ -409,7 +386,6 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > *cifs_file, bool wait_oplock_handler) > > > struct cifsLockInfo *li, *tmp; > > > struct cifs_fid fid; > > > struct cifs_pending_open open; > > > - bool oplock_break_cancelled; > > > > > > spin_lock(&tcon->open_file_lock); > > > spin_lock(&cifsi->open_file_lock); > > > @@ -449,9 +425,6 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > *cifs_file, bool wait_oplock_handler) > > > spin_unlock(&cifsi->open_file_lock); > > > spin_unlock(&tcon->open_file_lock); > > > > > > - oplock_break_cancelled = wait_oplock_handler ? > > > - cancel_work_sync(&cifs_file->oplock_break) : false; > > > - > > > if (!tcon->need_reconnect && !cifs_file->invalidHandle) { > > > struct TCP_Server_Info *server = tcon->ses->server; > > > unsigned int xid; > > > @@ -462,9 +435,6 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > *cifs_file, bool wait_oplock_handler) > > > _free_xid(xid); > > > } > > > > > > - if (oplock_break_cancelled) > > > - cifs_done_oplock_break(cifsi); > > > - > > > cifs_del_pending_open(&open); > > > > > > /* > > > @@ -487,6 +457,61 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > *cifs_file, bool wait_oplock_handler) > > > kfree(cifs_file); > > > } > > > > > > +static void cifsFileInfo_put_work(struct work_struct *work) > > > +{ > > > + struct cifsFileInfo *cifs_file = container_of(work, > > > + struct cifsFileInfo, put); > > > + > > > + cifsFileInfo_put_final(cifs_file); > > > +} > > > + > > > +/** > > > + * _cifsFileInfo_put - release a reference of file priv data > > > + * > > > + * This may involve closing the filehandle @cifs_file out on the > > > + * server. Must be called without holding tcon->open_file_lock, > > > + * cinode->open_file_lock and cifs_file->file_info_lock. > > > + * > > > + * If @wait_for_oplock_handler is true and we are releasing the last > > > + * reference, wait for any running oplock break handler of the file > > > + * and cancel any pending one. If calling this function from the > > > + * oplock break handler, you need to pass false. > > > + * > > > + */ > > > +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, > > > + bool wait_oplock_handler, bool offload) > > > +{ > > > + struct inode *inode = d_inode(cifs_file->dentry); > > > + struct cifsInodeInfo *cifsi = CIFS_I(inode); > > > + > > > + spin_lock(&cifs_file->file_info_lock); > > > + if (cifs_file->count > 1) { > > > + cifs_file->count--; > > > + spin_unlock(&cifs_file->file_info_lock); > > > + return; > > > + } > > > + spin_unlock(&cifs_file->file_info_lock); > > > + > > > + if (wait_oplock_handler && > > > cancel_work_sync(&cifs_file->oplock_break)) > > > + cifs_done_oplock_break(cifsi); > > > > At this point the file info is still in the lists and can be > > referenced by other threads not only including oplock break routine. > > So, we can't cancel oplock break here - it would lead to big timeouts > > to other clients trying to open the file because the server will be > > waiting for an oplock break ack. > > > > Let's do it in cifsFileInfo_put_final() as it was originally in > > cifsFileInfo_put() and we do not need an extra *wait_oplock_handler* > > argument - just cancel it unconditionally. If a caller of this > > function don't want to deadlock on himself (like cifs_oplock_break) it > > will just specify *offload* as True. > > > > > + > > > + if (offload) { > > > + INIT_WORK(&cifs_file->put, cifsFileInfo_put_work); > > INIT_WORK should be done when initializing cifsFileInfo structure. > > > > > + queue_work(fileinfo_put_wq, &cifs_file->put); > > As I mentioned in the parallel discussion queue_work here may return > False if the same work has been already queued. It may happen in the > following scenario: > > T1 gets reference - 1 > T1 is in putting the reference - checked ref > 1 > T2 gets reference - 2 > T1 queues work > T3 (worker) puts the reference - 1 but still didn't exit and remains > in the queue > T2 is in putting the reference - checked ref >1 > T2 queues work and it returns false (!) <--- error > > Sorry, it wasn't obvious to me when I was suggesting this yesterday. > > One way to fix this is to check queue_work output and "goto" to the > beginning of the function if False. But there is always a probability > that we end up in the same situation and it repeats and repeats again. > I think it should resolve itself at some point in time but it doesn't > look like a good approach. > > Another way to fix it is to revert the patch to the state it was at > yesterday - to offload only the remaining part of cifsFileInfo_put() > which will allow us to avoid the race from above. I am personally > leaning towards this way. I think so too. Can you do the changes below (wait_oplock_handler/INIT_WORK) to yesterday's patch for how you want them to look like and send it to the list? > > Other may comments about eliminating the need of *wait_oplock_handler* > and INIT_WORK stand for yesterday's patch too. > > Thoughts? > > > > > + } else > > > + cifsFileInfo_put_final(cifs_file); > > > +} > > > + > > > +/** > > > + * cifsFileInfo_put - release a reference of file priv data > > > + * > > > + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). > > > + */ > > > +void cifsFileInfo_put(struct cifsFileInfo *cifs_file) > > > +{ > > > + _cifsFileInfo_put(cifs_file, true, true); > > > +} > > > + > > > int cifs_open(struct inode *inode, struct file *file) > > > > > > { > > > @@ -808,7 +833,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool > > > can_flush) > > > int cifs_close(struct inode *inode, struct file *file) > > > { > > > if (file->private_data != NULL) { > > > - cifsFileInfo_put(file->private_data); > > > + _cifsFileInfo_put(file->private_data, true, false); > > > file->private_data = NULL; > > > } > > > > > > @@ -4742,7 +4767,7 @@ void cifs_oplock_break(struct work_struct *work) > > > cinode); > > > cifs_dbg(FYI, "Oplock release rc = %d\n", rc); > > > } > > > - _cifsFileInfo_put(cfile, false /* do not wait for ourself */); > > > + _cifsFileInfo_put(cfile, false /* do not wait for ourself */, > > > true); > > > > As mentioned above, here we only need *offload* as True to not wait > > for ourselves. > > > > > cifs_done_oplock_break(cinode); > > > } > > > > > > -- > > > 2.13.6 > > > > > > > Other than that, I think we the patch is on the right track. It should > > give us an opportunity to do file handle batching easily. > > > > -- > > Best regards, > > Pavel Shilovsky >
ср, 30 окт. 2019 г. в 18:04, Ronnie Sahlberg <lsahlber@redhat.com>: > > > > > > ----- Original Message ----- > > From: "Pavel Shilovsky" <piastryyy@gmail.com> > > To: "Ronnie Sahlberg" <lsahlber@redhat.com> > > Cc: "linux-cifs" <linux-cifs@vger.kernel.org> > > Sent: Thursday, 31 October, 2019 2:56:11 AM > > Subject: Re: [PATCH] cifs: move cifsFileInfo_put logic into a work-queue > > > > ср, 30 окт. 2019 г. в 08:46, Pavel Shilovsky <piastryyy@gmail.com>: > > > > > > вт, 29 окт. 2019 г. в 19:59, Ronnie Sahlberg <lsahlber@redhat.com>: > > > > > > > > This patch moves the final part of the cifsFileInfo_put() logic where we > > > > need a write lock on lock_sem to be processed in a separate thread that > > > > holds no other locks. > > > > This is to prevent deadlocks like the one below: > > > > > > > > > there are 6 processes looping to while trying to down_write > > > > > cinode->lock_sem, 5 of them from _cifsFileInfo_put, and one from > > > > > cifs_new_fileinfo > > > > > > > > > > and there are 5 other processes which are blocked, several of them > > > > > waiting on either PG_writeback or PG_locked (which are both set), all > > > > > for the same page of the file > > > > > > > > > > 2 inode_lock() (inode->i_rwsem) for the file > > > > > 1 wait_on_page_writeback() for the page > > > > > 1 down_read(inode->i_rwsem) for the inode of the directory > > > > > 1 inode_lock()(inode->i_rwsem) for the inode of the directory > > > > > 1 __lock_page > > > > > > > > > > > > > > > so processes are blocked waiting on: > > > > > page flags PG_locked and PG_writeback for one specific page > > > > > inode->i_rwsem for the directory > > > > > inode->i_rwsem for the file > > > > > cifsInodeInflock_sem > > > > > > > > > > > > > > > > > > > > here are the more gory details (let me know if I need to provide > > > > > anything more/better): > > > > > > > > > > [0 00:48:22.765] [UN] PID: 8863 TASK: ffff8c691547c5c0 CPU: 3 > > > > > COMMAND: "reopen_file" > > > > > #0 [ffff9965007e3ba8] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff9965007e3c38] schedule at ffffffff9b6e64df > > > > > #2 [ffff9965007e3c48] rwsem_down_write_slowpath at ffffffff9af283d7 > > > > > #3 [ffff9965007e3cb8] legitimize_path at ffffffff9b0f975d > > > > > #4 [ffff9965007e3d08] path_openat at ffffffff9b0fe55d > > > > > #5 [ffff9965007e3dd8] do_filp_open at ffffffff9b100a33 > > > > > #6 [ffff9965007e3ee0] do_sys_open at ffffffff9b0eb2d6 > > > > > #7 [ffff9965007e3f38] do_syscall_64 at ffffffff9ae04315 > > > > > * (I think legitimize_path is bogus) > > > > > > > > > > in path_openat > > > > > } else { > > > > > const char *s = path_init(nd, flags); > > > > > while (!(error = link_path_walk(s, nd)) && > > > > > (error = do_last(nd, file, op)) > 0) { <<<< > > > > > > > > > > do_last: > > > > > if (open_flag & O_CREAT) > > > > > inode_lock(dir->d_inode); <<<< > > > > > else > > > > > so it's trying to take inode->i_rwsem for the directory > > > > > > > > > > DENTRY INODE SUPERBLK TYPE PATH > > > > > ffff8c68bb8e79c0 ffff8c691158ef20 ffff8c6915bf9000 DIR /mnt/vm1_smb/ > > > > > inode.i_rwsem is ffff8c691158efc0 > > > > > > > > > > <struct rw_semaphore 0xffff8c691158efc0>: > > > > > owner: <struct task_struct 0xffff8c6914275d00> (UN - 8856 - > > > > > reopen_file), counter: 0x0000000000000003 > > > > > waitlist: 2 > > > > > 0xffff9965007e3c90 8863 reopen_file UN 0 > > > > > 1:29:22.926 > > > > > RWSEM_WAITING_FOR_WRITE > > > > > 0xffff996500393e00 9802 ls UN 0 > > > > > 1:17:26.700 > > > > > RWSEM_WAITING_FOR_READ > > > > > > > > > > > > > > > the owner of the inode.i_rwsem of the directory is: > > > > > > > > > > [0 00:00:00.109] [UN] PID: 8856 TASK: ffff8c6914275d00 CPU: 3 > > > > > COMMAND: "reopen_file" > > > > > #0 [ffff99650065b828] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff99650065b8b8] schedule at ffffffff9b6e64df > > > > > #2 [ffff99650065b8c8] schedule_timeout at ffffffff9b6e9f89 > > > > > #3 [ffff99650065b940] msleep at ffffffff9af573a9 > > > > > #4 [ffff99650065b948] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 > > > > > [cifs] > > > > > #5 [ffff99650065ba38] cifs_writepage_locked at ffffffffc0a0b8f3 [cifs] > > > > > #6 [ffff99650065bab0] cifs_launder_page at ffffffffc0a0bb72 [cifs] > > > > > #7 [ffff99650065bb30] invalidate_inode_pages2_range at > > > > > ffffffff9b04d4bd > > > > > #8 [ffff99650065bcb8] cifs_invalidate_mapping at ffffffffc0a11339 > > > > > [cifs] > > > > > #9 [ffff99650065bcd0] cifs_revalidate_mapping at ffffffffc0a1139a > > > > > [cifs] > > > > > #10 [ffff99650065bcf0] cifs_d_revalidate at ffffffffc0a014f6 [cifs] > > > > > #11 [ffff99650065bd08] path_openat at ffffffff9b0fe7f7 > > > > > #12 [ffff99650065bdd8] do_filp_open at ffffffff9b100a33 > > > > > #13 [ffff99650065bee0] do_sys_open at ffffffff9b0eb2d6 > > > > > #14 [ffff99650065bf38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > > > cifs_launder_page is for page 0xffffd1e2c07d2480 > > > > > > > > > > crash> page.index,mapping,flags 0xffffd1e2c07d2480 > > > > > index = 0x8 > > > > > mapping = 0xffff8c68f3cd0db0 > > > > > flags = 0xfffffc0008095 > > > > > > > > > > PAGE-FLAG BIT VALUE > > > > > PG_locked 0 0000001 > > > > > PG_uptodate 2 0000004 > > > > > PG_lru 4 0000010 > > > > > PG_waiters 7 0000080 > > > > > PG_writeback 15 0008000 > > > > > > > > > > > > > > > inode is ffff8c68f3cd0c40 > > > > > inode.i_rwsem is ffff8c68f3cd0ce0 > > > > > DENTRY INODE SUPERBLK TYPE PATH > > > > > ffff8c68a1f1b480 ffff8c68f3cd0c40 ffff8c6915bf9000 REG > > > > > /mnt/vm1_smb/testfile.8853 > > > > > > > > > > > > > > > this process holds the inode->i_rwsem for the parent directory, is > > > > > laundering a page attached to the inode of the file it's opening, and > > > > > in > > > > > _cifsFileInfo_put is trying to down_write the cifsInodeInflock_sem > > > > > for the file itself. > > > > > > > > > > > > > > > <struct rw_semaphore 0xffff8c68f3cd0ce0>: > > > > > owner: <struct task_struct 0xffff8c6914272e80> (UN - 8854 - > > > > > reopen_file), counter: 0x0000000000000003 > > > > > waitlist: 1 > > > > > 0xffff9965005dfd80 8855 reopen_file UN 0 > > > > > 1:29:22.912 > > > > > RWSEM_WAITING_FOR_WRITE > > > > > > > > > > this is the inode.i_rwsem for the file > > > > > > > > > > the owner: > > > > > > > > > > [0 00:48:22.739] [UN] PID: 8854 TASK: ffff8c6914272e80 CPU: 2 > > > > > COMMAND: "reopen_file" > > > > > #0 [ffff99650054fb38] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff99650054fbc8] schedule at ffffffff9b6e64df > > > > > #2 [ffff99650054fbd8] io_schedule at ffffffff9b6e68e2 > > > > > #3 [ffff99650054fbe8] __lock_page at ffffffff9b03c56f > > > > > #4 [ffff99650054fc80] pagecache_get_page at ffffffff9b03dcdf > > > > > #5 [ffff99650054fcc0] grab_cache_page_write_begin at ffffffff9b03ef4c > > > > > #6 [ffff99650054fcd0] cifs_write_begin at ffffffffc0a064ec [cifs] > > > > > #7 [ffff99650054fd30] generic_perform_write at ffffffff9b03bba4 > > > > > #8 [ffff99650054fda8] __generic_file_write_iter at ffffffff9b04060a > > > > > #9 [ffff99650054fdf0] cifs_strict_writev.cold.70 at ffffffffc0a4469b > > > > > [cifs] > > > > > #10 [ffff99650054fe48] new_sync_write at ffffffff9b0ec1dd > > > > > #11 [ffff99650054fed0] vfs_write at ffffffff9b0eed35 > > > > > #12 [ffff99650054ff00] ksys_write at ffffffff9b0eefd9 > > > > > #13 [ffff99650054ff38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > > > the process holds the inode->i_rwsem for the file to which it's > > > > > writing, > > > > > and is trying to __lock_page for the same page as in the other > > > > > processes > > > > > > > > > > > > > > > the other tasks: > > > > > [0 00:00:00.028] [UN] PID: 8859 TASK: ffff8c6915479740 CPU: 2 > > > > > COMMAND: "reopen_file" > > > > > #0 [ffff9965007b39d8] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff9965007b3a68] schedule at ffffffff9b6e64df > > > > > #2 [ffff9965007b3a78] schedule_timeout at ffffffff9b6e9f89 > > > > > #3 [ffff9965007b3af0] msleep at ffffffff9af573a9 > > > > > #4 [ffff9965007b3af8] cifs_new_fileinfo.cold.61 at ffffffffc0a42a07 > > > > > [cifs] > > > > > #5 [ffff9965007b3b78] cifs_open at ffffffffc0a0709d [cifs] > > > > > #6 [ffff9965007b3cd8] do_dentry_open at ffffffff9b0e9b7a > > > > > #7 [ffff9965007b3d08] path_openat at ffffffff9b0fe34f > > > > > #8 [ffff9965007b3dd8] do_filp_open at ffffffff9b100a33 > > > > > #9 [ffff9965007b3ee0] do_sys_open at ffffffff9b0eb2d6 > > > > > #10 [ffff9965007b3f38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > > > this is opening the file, and is trying to down_write cinode->lock_sem > > > > > > > > > > > > > > > [0 00:00:00.041] [UN] PID: 8860 TASK: ffff8c691547ae80 CPU: 2 > > > > > COMMAND: "reopen_file" > > > > > [0 00:00:00.057] [UN] PID: 8861 TASK: ffff8c6915478000 CPU: 3 > > > > > COMMAND: "reopen_file" > > > > > [0 00:00:00.059] [UN] PID: 8858 TASK: ffff8c6914271740 CPU: 2 > > > > > COMMAND: "reopen_file" > > > > > [0 00:00:00.109] [UN] PID: 8862 TASK: ffff8c691547dd00 CPU: 6 > > > > > COMMAND: "reopen_file" > > > > > #0 [ffff9965007c3c78] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff9965007c3d08] schedule at ffffffff9b6e64df > > > > > #2 [ffff9965007c3d18] schedule_timeout at ffffffff9b6e9f89 > > > > > #3 [ffff9965007c3d90] msleep at ffffffff9af573a9 > > > > > #4 [ffff9965007c3d98] _cifsFileInfo_put.cold.63 at ffffffffc0a42dd6 > > > > > [cifs] > > > > > #5 [ffff9965007c3e88] cifs_close at ffffffffc0a07aaf [cifs] > > > > > #6 [ffff9965007c3ea0] __fput at ffffffff9b0efa6e > > > > > #7 [ffff9965007c3ee8] task_work_run at ffffffff9aef1614 > > > > > #8 [ffff9965007c3f20] exit_to_usermode_loop at ffffffff9ae03d6f > > > > > #9 [ffff9965007c3f38] do_syscall_64 at ffffffff9ae0444c > > > > > > > > > > closing the file, and trying to down_write cifsi->lock_sem > > > > > > > > > > > > > > > [0 00:48:22.839] [UN] PID: 8857 TASK: ffff8c6914270000 CPU: 7 > > > > > COMMAND: "reopen_file" > > > > > #0 [ffff9965006a7cc8] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff9965006a7d58] schedule at ffffffff9b6e64df > > > > > #2 [ffff9965006a7d68] io_schedule at ffffffff9b6e68e2 > > > > > #3 [ffff9965006a7d78] wait_on_page_bit at ffffffff9b03cac6 > > > > > #4 [ffff9965006a7e10] __filemap_fdatawait_range at ffffffff9b03b028 > > > > > #5 [ffff9965006a7ed8] filemap_write_and_wait at ffffffff9b040165 > > > > > #6 [ffff9965006a7ef0] cifs_flush at ffffffffc0a0c2fa [cifs] > > > > > #7 [ffff9965006a7f10] filp_close at ffffffff9b0e93f1 > > > > > #8 [ffff9965006a7f30] __x64_sys_close at ffffffff9b0e9a0e > > > > > #9 [ffff9965006a7f38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > > > in __filemap_fdatawait_range > > > > > wait_on_page_writeback(page); > > > > > for the same page of the file > > > > > > > > > > > > > > > > > > > > [0 00:48:22.718] [UN] PID: 8855 TASK: ffff8c69142745c0 CPU: 7 > > > > > COMMAND: "reopen_file" > > > > > #0 [ffff9965005dfc98] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff9965005dfd28] schedule at ffffffff9b6e64df > > > > > #2 [ffff9965005dfd38] rwsem_down_write_slowpath at ffffffff9af283d7 > > > > > #3 [ffff9965005dfdf0] cifs_strict_writev at ffffffffc0a0c40a [cifs] > > > > > #4 [ffff9965005dfe48] new_sync_write at ffffffff9b0ec1dd > > > > > #5 [ffff9965005dfed0] vfs_write at ffffffff9b0eed35 > > > > > #6 [ffff9965005dff00] ksys_write at ffffffff9b0eefd9 > > > > > #7 [ffff9965005dff38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > > > inode_lock(inode); > > > > > > > > > > > > > > > and one 'ls' later on, to see whether the rest of the mount is > > > > > available > > > > > (the test file is in the root, so we get blocked up on the directory > > > > > ->i_rwsem), so the entire mount is unavailable > > > > > > > > > > [0 00:36:26.473] [UN] PID: 9802 TASK: ffff8c691436ae80 CPU: 4 > > > > > COMMAND: "ls" > > > > > #0 [ffff996500393d28] __schedule at ffffffff9b6e6095 > > > > > #1 [ffff996500393db8] schedule at ffffffff9b6e64df > > > > > #2 [ffff996500393dc8] rwsem_down_read_slowpath at ffffffff9b6e9421 > > > > > #3 [ffff996500393e78] down_read_killable at ffffffff9b6e95e2 > > > > > #4 [ffff996500393e88] iterate_dir at ffffffff9b103c56 > > > > > #5 [ffff996500393ec8] ksys_getdents64 at ffffffff9b104b0c > > > > > #6 [ffff996500393f30] __x64_sys_getdents64 at ffffffff9b104bb6 > > > > > #7 [ffff996500393f38] do_syscall_64 at ffffffff9ae04315 > > > > > > > > > > in iterate_dir: > > > > > if (shared) > > > > > res = down_read_killable(&inode->i_rwsem); <<<< > > > > > else > > > > > res = down_write_killable(&inode->i_rwsem); > > > > > > > > > > > > > Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com> > > > > --- > > > > fs/cifs/cifsfs.c | 13 +++++++- > > > > fs/cifs/cifsglob.h | 3 +- > > > > fs/cifs/file.c | 91 > > > > ++++++++++++++++++++++++++++++++++-------------------- > > > > 3 files changed, 72 insertions(+), 35 deletions(-) > > > > > > > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > > > > index e4e3b573d20c..f8e201c45ccb 100644 > > > > --- a/fs/cifs/cifsfs.c > > > > +++ b/fs/cifs/cifsfs.c > > > > @@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp; > > > > > > > > struct workqueue_struct *cifsiod_wq; > > > > struct workqueue_struct *decrypt_wq; > > > > +struct workqueue_struct *fileinfo_put_wq; > > > > struct workqueue_struct *cifsoplockd_wq; > > > > __u32 cifs_lock_secret; > > > > > > > > @@ -1557,11 +1558,18 @@ init_cifs(void) > > > > goto out_destroy_cifsiod_wq; > > > > } > > > > > > > > + fileinfo_put_wq = alloc_workqueue("cifsfileinfoput", > > > > + > > > > WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, > > > > 0); > > > > + if (!fileinfo_put_wq) { > > > > + rc = -ENOMEM; > > > > + goto out_destroy_decrypt_wq; > > > > + } > > > > + > > > > cifsoplockd_wq = alloc_workqueue("cifsoplockd", > > > > WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); > > > > if (!cifsoplockd_wq) { > > > > rc = -ENOMEM; > > > > - goto out_destroy_decrypt_wq; > > > > + goto out_destroy_fileinfo_put_wq; > > > > } > > > > > > > > rc = cifs_fscache_register(); > > > > @@ -1627,6 +1635,8 @@ init_cifs(void) > > > > cifs_fscache_unregister(); > > > > out_destroy_cifsoplockd_wq: > > > > destroy_workqueue(cifsoplockd_wq); > > > > +out_destroy_fileinfo_put_wq: > > > > + destroy_workqueue(fileinfo_put_wq); > > > > out_destroy_decrypt_wq: > > > > destroy_workqueue(decrypt_wq); > > > > out_destroy_cifsiod_wq: > > > > @@ -1656,6 +1666,7 @@ exit_cifs(void) > > > > cifs_fscache_unregister(); > > > > destroy_workqueue(cifsoplockd_wq); > > > > destroy_workqueue(decrypt_wq); > > > > + destroy_workqueue(fileinfo_put_wq); > > > > destroy_workqueue(cifsiod_wq); > > > > cifs_proc_clean(); > > > > } > > > > diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h > > > > index d78bfcc19156..0819b6498a3c 100644 > > > > --- a/fs/cifs/cifsglob.h > > > > +++ b/fs/cifs/cifsglob.h > > > > @@ -1265,6 +1265,7 @@ struct cifsFileInfo { > > > > struct mutex fh_mutex; /* prevents reopen race after dead ses*/ > > > > struct cifs_search_info srch_inf; > > > > struct work_struct oplock_break; /* work for oplock breaks */ > > > > + struct work_struct put; /* work for the final part of _put */ > > > > }; > > > > > > > > struct cifs_io_parms { > > > > @@ -1370,7 +1371,6 @@ cifsFileInfo_get_locked(struct cifsFileInfo > > > > *cifs_file) > > > > } > > > > > > > > struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); > > > > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool > > > > wait_oplock_hdlr); > > > > void cifsFileInfo_put(struct cifsFileInfo *cifs_file); > > > > > > > > #define CIFS_CACHE_READ_FLG 1 > > > > @@ -1907,6 +1907,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo > > > > *cfile); > > > > extern const struct slow_work_ops cifs_oplock_break_ops; > > > > extern struct workqueue_struct *cifsiod_wq; > > > > extern struct workqueue_struct *decrypt_wq; > > > > +extern struct workqueue_struct *fileinfo_put_wq; > > > > extern struct workqueue_struct *cifsoplockd_wq; > > > > extern __u32 cifs_lock_secret; > > > > > > > > diff --git a/fs/cifs/file.c b/fs/cifs/file.c > > > > index 67e7d0ffe385..d1dedb8fb31f 100644 > > > > --- a/fs/cifs/file.c > > > > +++ b/fs/cifs/file.c > > > > @@ -375,30 +375,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) > > > > return cifs_file; > > > > } > > > > > > > > -/** > > > > - * cifsFileInfo_put - release a reference of file priv data > > > > - * > > > > - * Always potentially wait for oplock handler. See _cifsFileInfo_put(). > > > > - */ > > > > -void cifsFileInfo_put(struct cifsFileInfo *cifs_file) > > > > -{ > > > > - _cifsFileInfo_put(cifs_file, true); > > > > -} > > > > - > > > > -/** > > > > - * _cifsFileInfo_put - release a reference of file priv data > > > > - * > > > > - * This may involve closing the filehandle @cifs_file out on the > > > > - * server. Must be called without holding tcon->open_file_lock and > > > > - * cifs_file->file_info_lock. > > > > - * > > > > - * If @wait_for_oplock_handler is true and we are releasing the last > > > > - * reference, wait for any running oplock break handler of the file > > > > - * and cancel any pending one. If calling this function from the > > > > - * oplock break handler, you need to pass false. > > > > - * > > > > - */ > > > > -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool > > > > wait_oplock_handler) > > > > +void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) > > > > { > > > > struct inode *inode = d_inode(cifs_file->dentry); > > > > struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); > > > > @@ -409,7 +386,6 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > > *cifs_file, bool wait_oplock_handler) > > > > struct cifsLockInfo *li, *tmp; > > > > struct cifs_fid fid; > > > > struct cifs_pending_open open; > > > > - bool oplock_break_cancelled; > > > > > > > > spin_lock(&tcon->open_file_lock); > > > > spin_lock(&cifsi->open_file_lock); > > > > @@ -449,9 +425,6 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > > *cifs_file, bool wait_oplock_handler) > > > > spin_unlock(&cifsi->open_file_lock); > > > > spin_unlock(&tcon->open_file_lock); > > > > > > > > - oplock_break_cancelled = wait_oplock_handler ? > > > > - cancel_work_sync(&cifs_file->oplock_break) : false; > > > > - > > > > if (!tcon->need_reconnect && !cifs_file->invalidHandle) { > > > > struct TCP_Server_Info *server = tcon->ses->server; > > > > unsigned int xid; > > > > @@ -462,9 +435,6 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > > *cifs_file, bool wait_oplock_handler) > > > > _free_xid(xid); > > > > } > > > > > > > > - if (oplock_break_cancelled) > > > > - cifs_done_oplock_break(cifsi); > > > > - > > > > cifs_del_pending_open(&open); > > > > > > > > /* > > > > @@ -487,6 +457,61 @@ void _cifsFileInfo_put(struct cifsFileInfo > > > > *cifs_file, bool wait_oplock_handler) > > > > kfree(cifs_file); > > > > } > > > > > > > > +static void cifsFileInfo_put_work(struct work_struct *work) > > > > +{ > > > > + struct cifsFileInfo *cifs_file = container_of(work, > > > > + struct cifsFileInfo, put); > > > > + > > > > + cifsFileInfo_put_final(cifs_file); > > > > +} > > > > + > > > > +/** > > > > + * _cifsFileInfo_put - release a reference of file priv data > > > > + * > > > > + * This may involve closing the filehandle @cifs_file out on the > > > > + * server. Must be called without holding tcon->open_file_lock, > > > > + * cinode->open_file_lock and cifs_file->file_info_lock. > > > > + * > > > > + * If @wait_for_oplock_handler is true and we are releasing the last > > > > + * reference, wait for any running oplock break handler of the file > > > > + * and cancel any pending one. If calling this function from the > > > > + * oplock break handler, you need to pass false. > > > > + * > > > > + */ > > > > +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, > > > > + bool wait_oplock_handler, bool offload) > > > > +{ > > > > + struct inode *inode = d_inode(cifs_file->dentry); > > > > + struct cifsInodeInfo *cifsi = CIFS_I(inode); > > > > + > > > > + spin_lock(&cifs_file->file_info_lock); > > > > + if (cifs_file->count > 1) { > > > > + cifs_file->count--; > > > > + spin_unlock(&cifs_file->file_info_lock); > > > > + return; > > > > + } > > > > + spin_unlock(&cifs_file->file_info_lock); > > > > + > > > > + if (wait_oplock_handler && > > > > cancel_work_sync(&cifs_file->oplock_break)) > > > > + cifs_done_oplock_break(cifsi); > > > > > > At this point the file info is still in the lists and can be > > > referenced by other threads not only including oplock break routine. > > > So, we can't cancel oplock break here - it would lead to big timeouts > > > to other clients trying to open the file because the server will be > > > waiting for an oplock break ack. > > > > > > Let's do it in cifsFileInfo_put_final() as it was originally in > > > cifsFileInfo_put() and we do not need an extra *wait_oplock_handler* > > > argument - just cancel it unconditionally. If a caller of this > > > function don't want to deadlock on himself (like cifs_oplock_break) it > > > will just specify *offload* as True. > > > > > > > + > > > > + if (offload) { > > > > + INIT_WORK(&cifs_file->put, cifsFileInfo_put_work); > > > > INIT_WORK should be done when initializing cifsFileInfo structure. > > > > > > > > + queue_work(fileinfo_put_wq, &cifs_file->put); > > > > As I mentioned in the parallel discussion queue_work here may return > > False if the same work has been already queued. It may happen in the > > following scenario: > > > > T1 gets reference - 1 > > T1 is in putting the reference - checked ref > 1 > > T2 gets reference - 2 > > T1 queues work > > T3 (worker) puts the reference - 1 but still didn't exit and remains > > in the queue > > T2 is in putting the reference - checked ref >1 > > T2 queues work and it returns false (!) <--- error > > > > Sorry, it wasn't obvious to me when I was suggesting this yesterday. > > > > One way to fix this is to check queue_work output and "goto" to the > > beginning of the function if False. But there is always a probability > > that we end up in the same situation and it repeats and repeats again. > > I think it should resolve itself at some point in time but it doesn't > > look like a good approach. > > > > Another way to fix it is to revert the patch to the state it was at > > yesterday - to offload only the remaining part of cifsFileInfo_put() > > which will allow us to avoid the race from above. I am personally > > leaning towards this way. > > I think so too. > Can you do the changes below (wait_oplock_handler/INIT_WORK) to yesterday's patch for how you want them to look like and send it to the list? We can leave wait_oplock_handler cleanup for a follow-on work. As for INIT_WORK, please find a fix (on top of your patch) below. It basically does the same as we do for cifs_oplock_break work today. ----------------------------- diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 309d6948d2f4..3b077074cc1f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -288,6 +288,8 @@ cifs_down_write(struct rw_semaphore *sem) msleep(10); } +static void cifsFileInfo_put_work(struct work_struct *work); + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -325,6 +327,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->invalidHandle = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); + INIT_WORK(&cfile->put, cifsFileInfo_put_work); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -417,7 +420,7 @@ static void cifsFileInfo_put_work(struct work_struct *work) */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { - _cifsFileInfo_put(cifs_file, true, true); + _cifsFileInfo_put(cifs_file, true, true); } /** @@ -502,10 +505,9 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, cifs_del_pending_open(&open); - if (offload) { - INIT_WORK(&cifs_file->put, cifsFileInfo_put_work); + if (offload) queue_work(fileinfo_put_wq, &cifs_file->put); - } else + else cifsFileInfo_put_final(cifs_file); } ----------------------------- Other than that, I think the patch is good to go. Once you fix the above, please feel free to add my Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com> Thanks for fixing this problem! -- Best regards, Pavel Shilovsky
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e4e3b573d20c..f8e201c45ccb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; +struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; @@ -1557,11 +1558,18 @@ init_cifs(void) goto out_destroy_cifsiod_wq; } + fileinfo_put_wq = alloc_workqueue("cifsfileinfoput", + WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!fileinfo_put_wq) { + rc = -ENOMEM; + goto out_destroy_decrypt_wq; + } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!cifsoplockd_wq) { rc = -ENOMEM; - goto out_destroy_decrypt_wq; + goto out_destroy_fileinfo_put_wq; } rc = cifs_fscache_register(); @@ -1627,6 +1635,8 @@ init_cifs(void) cifs_fscache_unregister(); out_destroy_cifsoplockd_wq: destroy_workqueue(cifsoplockd_wq); +out_destroy_fileinfo_put_wq: + destroy_workqueue(fileinfo_put_wq); out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: @@ -1656,6 +1666,7 @@ exit_cifs(void) cifs_fscache_unregister(); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); + destroy_workqueue(fileinfo_put_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d78bfcc19156..0819b6498a3c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1265,6 +1265,7 @@ struct cifsFileInfo { struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ + struct work_struct put; /* work for the final part of _put */ }; struct cifs_io_parms { @@ -1370,7 +1371,6 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1907,6 +1907,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; +extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 67e7d0ffe385..d1dedb8fb31f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -375,30 +375,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } -/** - * cifsFileInfo_put - release a reference of file priv data - * - * Always potentially wait for oplock handler. See _cifsFileInfo_put(). - */ -void cifsFileInfo_put(struct cifsFileInfo *cifs_file) -{ - _cifsFileInfo_put(cifs_file, true); -} - -/** - * _cifsFileInfo_put - release a reference of file priv data - * - * This may involve closing the filehandle @cifs_file out on the - * server. Must be called without holding tcon->open_file_lock and - * cifs_file->file_info_lock. - * - * If @wait_for_oplock_handler is true and we are releasing the last - * reference, wait for any running oplock break handler of the file - * and cancel any pending one. If calling this function from the - * oplock break handler, you need to pass false. - * - */ -void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) +void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) { struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); @@ -409,7 +386,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; - bool oplock_break_cancelled; spin_lock(&tcon->open_file_lock); spin_lock(&cifsi->open_file_lock); @@ -449,9 +425,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) spin_unlock(&cifsi->open_file_lock); spin_unlock(&tcon->open_file_lock); - oplock_break_cancelled = wait_oplock_handler ? - cancel_work_sync(&cifs_file->oplock_break) : false; - if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; unsigned int xid; @@ -462,9 +435,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) _free_xid(xid); } - if (oplock_break_cancelled) - cifs_done_oplock_break(cifsi); - cifs_del_pending_open(&open); /* @@ -487,6 +457,61 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) kfree(cifs_file); } +static void cifsFileInfo_put_work(struct work_struct *work) +{ + struct cifsFileInfo *cifs_file = container_of(work, + struct cifsFileInfo, put); + + cifsFileInfo_put_final(cifs_file); +} + +/** + * _cifsFileInfo_put - release a reference of file priv data + * + * This may involve closing the filehandle @cifs_file out on the + * server. Must be called without holding tcon->open_file_lock, + * cinode->open_file_lock and cifs_file->file_info_lock. + * + * If @wait_for_oplock_handler is true and we are releasing the last + * reference, wait for any running oplock break handler of the file + * and cancel any pending one. If calling this function from the + * oplock break handler, you need to pass false. + * + */ +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, + bool wait_oplock_handler, bool offload) +{ + struct inode *inode = d_inode(cifs_file->dentry); + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + spin_lock(&cifs_file->file_info_lock); + if (cifs_file->count > 1) { + cifs_file->count--; + spin_unlock(&cifs_file->file_info_lock); + return; + } + spin_unlock(&cifs_file->file_info_lock); + + if (wait_oplock_handler && cancel_work_sync(&cifs_file->oplock_break)) + cifs_done_oplock_break(cifsi); + + if (offload) { + INIT_WORK(&cifs_file->put, cifsFileInfo_put_work); + queue_work(fileinfo_put_wq, &cifs_file->put); + } else + cifsFileInfo_put_final(cifs_file); +} + +/** + * cifsFileInfo_put - release a reference of file priv data + * + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). + */ +void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + _cifsFileInfo_put(cifs_file, true, true); +} + int cifs_open(struct inode *inode, struct file *file) { @@ -808,7 +833,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) int cifs_close(struct inode *inode, struct file *file) { if (file->private_data != NULL) { - cifsFileInfo_put(file->private_data); + _cifsFileInfo_put(file->private_data, true, false); file->private_data = NULL; } @@ -4742,7 +4767,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } - _cifsFileInfo_put(cfile, false /* do not wait for ourself */); + _cifsFileInfo_put(cfile, false /* do not wait for ourself */, true); cifs_done_oplock_break(cinode); }