Message ID | 20210826075941.28480-1-ghe@suse.com (mailing list archive)
---|---
State | New, archived
Series | ocfs2: avoid getting dlm lock of the target directory multiple times during reflink process
Hello Joseph and Wengang, When you have time, please help review this patch. About the deadlock problem which was caused by ocfs2_downconvert_lock failure, we have the fix patch, it is very key. But I feel this patch is still useful as a optimization patch, the user case is to reflink the files to the same directory concurrently, our users usually backup the files(via reflink) from the cluster nodes concurrently(via crontab) every day/hour. The current design, during the reflink process, the node will acquire/release dlm lock of the target directory multiple times, this is very inefficient in concurrently reflink. Thanks Gang On 2021/8/26 15:59, Gang He wrote: > During the reflink process, we should acquire the target directory > inode dlm lock at the beginning, and hold this dlm lock until end > of the function. > With this patch, we avoid dlm lock ping-pong effect when clone > files to the same directory simultaneously from multiple nodes. > There is a typical user scenario, users regularly back up files > to a specified directory through the reflink feature from the > multiple nodes. > > Signed-off-by: Gang He <ghe@suse.com> > --- > fs/ocfs2/namei.c | 32 +++++++++++++------------------- > fs/ocfs2/namei.h | 2 ++ > fs/ocfs2/refcounttree.c | 15 +++++++++++---- > fs/ocfs2/xattr.c | 12 +----------- > fs/ocfs2/xattr.h | 1 + > 5 files changed, 28 insertions(+), 34 deletions(-) > > diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c > index 2c46ff6ba4ea..f8bbb22cc60b 100644 > --- a/fs/ocfs2/namei.c > +++ b/fs/ocfs2/namei.c > @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, > } > > int ocfs2_create_inode_in_orphan(struct inode *dir, > + struct buffer_head **dir_bh, > int mode, > struct inode **new_inode) > { > @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, > > brelse(new_di_bh); > > - if (!status) > - *new_inode = inode; > - > ocfs2_free_dir_lookup_result(&orphan_insert); > > - ocfs2_inode_unlock(dir, 1); > - brelse(parent_di_bh); > + if (!status) { > + *new_inode = inode; > + *dir_bh = parent_di_bh; > + } else { > + ocfs2_inode_unlock(dir, 1); > + brelse(parent_di_bh); > + } > + > return status; > } > > @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, > } > > int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, > + struct buffer_head *dir_bh, > struct inode *inode, > struct dentry *dentry) > { > int status = 0; > - struct buffer_head *parent_di_bh = NULL; > handle_t *handle = NULL; > struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); > struct ocfs2_dinode *dir_di, *di; > @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, > (unsigned long long)OCFS2_I(dir)->ip_blkno, > (unsigned long long)OCFS2_I(inode)->ip_blkno); > > - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); > - if (status < 0) { > - if (status != -ENOENT) > - mlog_errno(status); > - return status; > - } > - > - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; > + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; > if (!dir_di->i_links_count) { > /* can't make a file in a deleted directory. */ > status = -ENOENT; > @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, > goto leave; > > /* get a spot inside the dir. 
*/ > - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, > + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, > dentry->d_name.name, > dentry->d_name.len, &lookup); > if (status < 0) { > @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, > ocfs2_journal_dirty(handle, di_bh); > > status = ocfs2_add_entry(handle, dentry, inode, > - OCFS2_I(inode)->ip_blkno, parent_di_bh, > + OCFS2_I(inode)->ip_blkno, dir_bh, > &lookup); > if (status < 0) { > mlog_errno(status); > @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, > iput(orphan_dir_inode); > leave: > > - ocfs2_inode_unlock(dir, 1); > - > brelse(di_bh); > - brelse(parent_di_bh); > brelse(orphan_dir_bh); > > ocfs2_free_dir_lookup_result(&lookup); > diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h > index 9cc891eb874e..03a2c526e2c1 100644 > --- a/fs/ocfs2/namei.h > +++ b/fs/ocfs2/namei.h > @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, > struct buffer_head *orphan_dir_bh, > bool dio); > int ocfs2_create_inode_in_orphan(struct inode *dir, > + struct buffer_head **dir_bh, > int mode, > struct inode **new_inode); > int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, > @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, > struct inode *inode, struct buffer_head *di_bh, > int update_isize, loff_t end); > int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, > + struct buffer_head *dir_bh, > struct inode *new_inode, > struct dentry *new_dentry); > > diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c > index 7f6355cbb587..a9a0c7c37e8e 100644 > --- a/fs/ocfs2/refcounttree.c > +++ b/fs/ocfs2/refcounttree.c > @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, > { > int error, had_lock; > struct inode *inode = d_inode(old_dentry); > - struct buffer_head *old_bh = NULL; > + struct buffer_head *old_bh = NULL, *dir_bh = NULL; > struct inode *new_orphan_inode = NULL; > struct ocfs2_lock_holder oh; > > @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, > return -EOPNOTSUPP; > > > - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, > + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, > &new_orphan_inode); > if (error) { > mlog_errno(error); > @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, > > /* If the security isn't preserved, we need to re-initialize them. */ > if (!preserve) { > - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, > + error = ocfs2_init_security_and_acl(dir, dir_bh, > + new_orphan_inode, > &new_dentry->d_name); > if (error) > mlog_errno(error); > } > if (!error) { > - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, > + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, > + new_orphan_inode, > new_dentry); > if (error) > mlog_errno(error); > @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, > iput(new_orphan_inode); > } > > + if (dir_bh) { > + ocfs2_inode_unlock(dir, 1); > + brelse(dir_bh); > + } > + > return error; > } > > diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c > index dd784eb0cd7c..3f23e3a5018c 100644 > --- a/fs/ocfs2/xattr.c > +++ b/fs/ocfs2/xattr.c > @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, > /* > * Initialize security and acl for a already created inode. > * Used for reflink a non-preserve-security file. 
> - * > - * It uses common api like ocfs2_xattr_set, so the caller > - * must not hold any lock expect i_mutex. > */ > int ocfs2_init_security_and_acl(struct inode *dir, > + struct buffer_head *dir_bh, > struct inode *inode, > const struct qstr *qstr) > { > int ret = 0; > - struct buffer_head *dir_bh = NULL; > > ret = ocfs2_init_security_get(inode, dir, qstr, NULL); > if (ret) { > @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, > goto leave; > } > > - ret = ocfs2_inode_lock(dir, &dir_bh, 0); > - if (ret) { > - mlog_errno(ret); > - goto leave; > - } > ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); > if (ret) > mlog_errno(ret); > > - ocfs2_inode_unlock(dir, 0); > - brelse(dir_bh); > leave: > return ret; > } > diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h > index 00308b57f64f..b27fd8ba0019 100644 > --- a/fs/ocfs2/xattr.h > +++ b/fs/ocfs2/xattr.h > @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, > struct buffer_head *new_bh, > bool preserve_security); > int ocfs2_init_security_and_acl(struct inode *dir, > + struct buffer_head *dir_bh, > struct inode *inode, > const struct qstr *qstr); > #endif /* OCFS2_XATTR_H */ >
On 8/31/21 2:25 PM, Gang He wrote: > Hello Joseph and Wengang, > > When you have time, please help review this patch. > About the deadlock problem which was caused by ocfs2_downconvert_lock failure, we have the fix patch, it is very key. > But I feel this patch is still useful as a optimization patch, the user > case is to reflink the files to the same directory concurrently, our users usually backup the files(via reflink) from the cluster nodes concurrently(via crontab) every day/hour. > The current design, during the reflink process, the node will acquire/release dlm lock of the target directory multiple times, > this is very inefficient in concurrently reflink. > Sure, I'll spend some time this week. Thanks, Joseph
Hi Gang, On 8/26/21 3:59 PM, Gang He wrote: > During the reflink process, we should acquire the target directory > inode dlm lock at the beginning, and hold this dlm lock until end > of the function. > With this patch, we avoid dlm lock ping-pong effect when clone > files to the same directory simultaneously from multiple nodes. > There is a typical user scenario, users regularly back up files > to a specified directory through the reflink feature from the > multiple nodes. > Since now it take dir inode lock across the whole reflink, it may impact other concurrent operations under the same directory. Have you evaluated such cases? Thanks, Joseph
Hi Joseph, On 2021/9/6 19:14, Joseph Qi wrote: > Hi Gang, > > > On 8/26/21 3:59 PM, Gang He wrote: >> During the reflink process, we should acquire the target directory >> inode dlm lock at the beginning, and hold this dlm lock until end >> of the function. >> With this patch, we avoid dlm lock ping-pong effect when clone >> files to the same directory simultaneously from multiple nodes. >> There is a typical user scenario, users regularly back up files >> to a specified directory through the reflink feature from the >> multiple nodes. >> > Since now it take dir inode lock across the whole reflink, it may impact > other concurrent operations under the same directory. > Have you evaluated such cases? So far, I do not find there is any risk for taking dir inode lock across the reflink function, it will block getting that dir inode lock a little time, but avoid ping-pong effect during the reflink function. Thanks Gang > > Thanks, > Joseph > >
Hi Gang, Sure, I will look into the problem you are trying to address. Any bug fix and performance improvement is welcomed! Well, can you please provide the analysis on the tcpdumps between the (two) nodes that covers the reflink operation with/without your patch to show how you saved dlm locking ping-pongs? And what cases did you test to get better performance? thanks, wengang > On Aug 30, 2021, at 11:25 PM, Gang He <ghe@suse.com> wrote: > > Hello Joseph and Wengang, > > When you have time, please help review this patch. > About the deadlock problem which was caused by ocfs2_downconvert_lock > failure, we have the fix patch, it is very key. > But I feel this patch is still useful as a optimization patch, the user > case is to reflink the files to the same directory concurrently, our > users usually backup the files(via reflink) from the cluster nodes > concurrently(via crontab) every day/hour. > The current design, during the reflink process, the node will > acquire/release dlm lock of the target directory multiple times, > this is very inefficient in concurrently reflink. > > > Thanks > Gang > > On 2021/8/26 15:59, Gang He wrote: >> During the reflink process, we should acquire the target directory >> inode dlm lock at the beginning, and hold this dlm lock until end >> of the function. >> With this patch, we avoid dlm lock ping-pong effect when clone >> files to the same directory simultaneously from multiple nodes. >> There is a typical user scenario, users regularly back up files >> to a specified directory through the reflink feature from the >> multiple nodes. >> >> Signed-off-by: Gang He <ghe@suse.com> >> --- >> fs/ocfs2/namei.c | 32 +++++++++++++------------------- >> fs/ocfs2/namei.h | 2 ++ >> fs/ocfs2/refcounttree.c | 15 +++++++++++---- >> fs/ocfs2/xattr.c | 12 +----------- >> fs/ocfs2/xattr.h | 1 + >> 5 files changed, 28 insertions(+), 34 deletions(-) >> >> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c >> index 2c46ff6ba4ea..f8bbb22cc60b 100644 >> --- a/fs/ocfs2/namei.c >> +++ b/fs/ocfs2/namei.c >> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, >> } >> >> int ocfs2_create_inode_in_orphan(struct inode *dir, >> + struct buffer_head **dir_bh, >> int mode, >> struct inode **new_inode) >> { >> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, >> >> brelse(new_di_bh); >> >> - if (!status) >> - *new_inode = inode; >> - >> ocfs2_free_dir_lookup_result(&orphan_insert); >> >> - ocfs2_inode_unlock(dir, 1); >> - brelse(parent_di_bh); >> + if (!status) { >> + *new_inode = inode; >> + *dir_bh = parent_di_bh; >> + } else { >> + ocfs2_inode_unlock(dir, 1); >> + brelse(parent_di_bh); >> + } >> + >> return status; >> } >> >> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >> } >> >> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >> + struct buffer_head *dir_bh, >> struct inode *inode, >> struct dentry *dentry) >> { >> int status = 0; >> - struct buffer_head *parent_di_bh = NULL; >> handle_t *handle = NULL; >> struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); >> struct ocfs2_dinode *dir_di, *di; >> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >> (unsigned long long)OCFS2_I(dir)->ip_blkno, >> (unsigned long long)OCFS2_I(inode)->ip_blkno); >> >> - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); >> - if (status < 0) { >> - if (status != -ENOENT) >> - mlog_errno(status); >> - return status; >> - } >> - >> - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; >> + 
dir_di = (struct ocfs2_dinode *) dir_bh->b_data; >> if (!dir_di->i_links_count) { >> /* can't make a file in a deleted directory. */ >> status = -ENOENT; >> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >> goto leave; >> >> /* get a spot inside the dir. */ >> - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, >> + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, >> dentry->d_name.name, >> dentry->d_name.len, &lookup); >> if (status < 0) { >> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >> ocfs2_journal_dirty(handle, di_bh); >> >> status = ocfs2_add_entry(handle, dentry, inode, >> - OCFS2_I(inode)->ip_blkno, parent_di_bh, >> + OCFS2_I(inode)->ip_blkno, dir_bh, >> &lookup); >> if (status < 0) { >> mlog_errno(status); >> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >> iput(orphan_dir_inode); >> leave: >> >> - ocfs2_inode_unlock(dir, 1); >> - >> brelse(di_bh); >> - brelse(parent_di_bh); >> brelse(orphan_dir_bh); >> >> ocfs2_free_dir_lookup_result(&lookup); >> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h >> index 9cc891eb874e..03a2c526e2c1 100644 >> --- a/fs/ocfs2/namei.h >> +++ b/fs/ocfs2/namei.h >> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, >> struct buffer_head *orphan_dir_bh, >> bool dio); >> int ocfs2_create_inode_in_orphan(struct inode *dir, >> + struct buffer_head **dir_bh, >> int mode, >> struct inode **new_inode); >> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, >> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >> struct inode *inode, struct buffer_head *di_bh, >> int update_isize, loff_t end); >> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >> + struct buffer_head *dir_bh, >> struct inode *new_inode, >> struct dentry *new_dentry); >> >> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c >> index 7f6355cbb587..a9a0c7c37e8e 100644 >> --- a/fs/ocfs2/refcounttree.c >> +++ b/fs/ocfs2/refcounttree.c >> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >> { >> int error, had_lock; >> struct inode *inode = d_inode(old_dentry); >> - struct buffer_head *old_bh = NULL; >> + struct buffer_head *old_bh = NULL, *dir_bh = NULL; >> struct inode *new_orphan_inode = NULL; >> struct ocfs2_lock_holder oh; >> >> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >> return -EOPNOTSUPP; >> >> >> - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, >> + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, >> &new_orphan_inode); >> if (error) { >> mlog_errno(error); >> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >> >> /* If the security isn't preserved, we need to re-initialize them. 
*/ >> if (!preserve) { >> - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, >> + error = ocfs2_init_security_and_acl(dir, dir_bh, >> + new_orphan_inode, >> &new_dentry->d_name); >> if (error) >> mlog_errno(error); >> } >> if (!error) { >> - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, >> + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, >> + new_orphan_inode, >> new_dentry); >> if (error) >> mlog_errno(error); >> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >> iput(new_orphan_inode); >> } >> >> + if (dir_bh) { >> + ocfs2_inode_unlock(dir, 1); >> + brelse(dir_bh); >> + } >> + >> return error; >> } >> >> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c >> index dd784eb0cd7c..3f23e3a5018c 100644 >> --- a/fs/ocfs2/xattr.c >> +++ b/fs/ocfs2/xattr.c >> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >> /* >> * Initialize security and acl for a already created inode. >> * Used for reflink a non-preserve-security file. >> - * >> - * It uses common api like ocfs2_xattr_set, so the caller >> - * must not hold any lock expect i_mutex. >> */ >> int ocfs2_init_security_and_acl(struct inode *dir, >> + struct buffer_head *dir_bh, >> struct inode *inode, >> const struct qstr *qstr) >> { >> int ret = 0; >> - struct buffer_head *dir_bh = NULL; >> >> ret = ocfs2_init_security_get(inode, dir, qstr, NULL); >> if (ret) { >> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, >> goto leave; >> } >> >> - ret = ocfs2_inode_lock(dir, &dir_bh, 0); >> - if (ret) { >> - mlog_errno(ret); >> - goto leave; >> - } >> ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); >> if (ret) >> mlog_errno(ret); >> >> - ocfs2_inode_unlock(dir, 0); >> - brelse(dir_bh); >> leave: >> return ret; >> } >> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h >> index 00308b57f64f..b27fd8ba0019 100644 >> --- a/fs/ocfs2/xattr.h >> +++ b/fs/ocfs2/xattr.h >> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >> struct buffer_head *new_bh, >> bool preserve_security); >> int ocfs2_init_security_and_acl(struct inode *dir, >> + struct buffer_head *dir_bh, >> struct inode *inode, >> const struct qstr *qstr); >> #endif /* OCFS2_XATTR_H */ >> > > > _______________________________________________ > Ocfs2-devel mailing list > Ocfs2-devel@oss.oracle.com > https://oss.oracle.com/mailman/listinfo/ocfs2-devel
On 2021/9/7 23:57, Wengang Wang wrote: > Hi Gang, > > Sure, I will look into the problem you are trying to address. Any bug fix and performance improvement is welcomed! > Well, can you please provide the analysis on the tcpdumps between the (two) nodes that covers the reflink operation with/without your patch to show how you saved dlm locking ping-pongs? The code change will hold the reflink destination dir inode dlm lock, release it until the whole reflink process is completed. The current code will hold/release this dlm lock three times during the reflink process. If there are some concurrently reflink operation to that directory from other nodes, the ping-pong effect is that directory inode related data will be synchronized(multiple times) to the disk when inode dlm lock is downconverted during one reflink operation. According to my test, running the reflink command to clone a file to the same directory repeatedly from three nodes, the code change can shorten the previous half of the time. Thanks Gang > > And what cases did you test to get better performance? > > thanks, > wengang > >> On Aug 30, 2021, at 11:25 PM, Gang He <ghe@suse.com> wrote: >> >> Hello Joseph and Wengang, >> >> When you have time, please help review this patch. >> About the deadlock problem which was caused by ocfs2_downconvert_lock >> failure, we have the fix patch, it is very key. >> But I feel this patch is still useful as a optimization patch, the user >> case is to reflink the files to the same directory concurrently, our >> users usually backup the files(via reflink) from the cluster nodes >> concurrently(via crontab) every day/hour. >> The current design, during the reflink process, the node will >> acquire/release dlm lock of the target directory multiple times, >> this is very inefficient in concurrently reflink. >> >> >> Thanks >> Gang >> >> On 2021/8/26 15:59, Gang He wrote: >>> During the reflink process, we should acquire the target directory >>> inode dlm lock at the beginning, and hold this dlm lock until end >>> of the function. >>> With this patch, we avoid dlm lock ping-pong effect when clone >>> files to the same directory simultaneously from multiple nodes. >>> There is a typical user scenario, users regularly back up files >>> to a specified directory through the reflink feature from the >>> multiple nodes. 
>>> >>> Signed-off-by: Gang He <ghe@suse.com> >>> --- >>> fs/ocfs2/namei.c | 32 +++++++++++++------------------- >>> fs/ocfs2/namei.h | 2 ++ >>> fs/ocfs2/refcounttree.c | 15 +++++++++++---- >>> fs/ocfs2/xattr.c | 12 +----------- >>> fs/ocfs2/xattr.h | 1 + >>> 5 files changed, 28 insertions(+), 34 deletions(-) >>> >>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c >>> index 2c46ff6ba4ea..f8bbb22cc60b 100644 >>> --- a/fs/ocfs2/namei.c >>> +++ b/fs/ocfs2/namei.c >>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, >>> } >>> >>> int ocfs2_create_inode_in_orphan(struct inode *dir, >>> + struct buffer_head **dir_bh, >>> int mode, >>> struct inode **new_inode) >>> { >>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, >>> >>> brelse(new_di_bh); >>> >>> - if (!status) >>> - *new_inode = inode; >>> - >>> ocfs2_free_dir_lookup_result(&orphan_insert); >>> >>> - ocfs2_inode_unlock(dir, 1); >>> - brelse(parent_di_bh); >>> + if (!status) { >>> + *new_inode = inode; >>> + *dir_bh = parent_di_bh; >>> + } else { >>> + ocfs2_inode_unlock(dir, 1); >>> + brelse(parent_di_bh); >>> + } >>> + >>> return status; >>> } >>> >>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >>> } >>> >>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>> + struct buffer_head *dir_bh, >>> struct inode *inode, >>> struct dentry *dentry) >>> { >>> int status = 0; >>> - struct buffer_head *parent_di_bh = NULL; >>> handle_t *handle = NULL; >>> struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); >>> struct ocfs2_dinode *dir_di, *di; >>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>> (unsigned long long)OCFS2_I(dir)->ip_blkno, >>> (unsigned long long)OCFS2_I(inode)->ip_blkno); >>> >>> - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); >>> - if (status < 0) { >>> - if (status != -ENOENT) >>> - mlog_errno(status); >>> - return status; >>> - } >>> - >>> - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; >>> + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; >>> if (!dir_di->i_links_count) { >>> /* can't make a file in a deleted directory. */ >>> status = -ENOENT; >>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>> goto leave; >>> >>> /* get a spot inside the dir. 
*/ >>> - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, >>> + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, >>> dentry->d_name.name, >>> dentry->d_name.len, &lookup); >>> if (status < 0) { >>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>> ocfs2_journal_dirty(handle, di_bh); >>> >>> status = ocfs2_add_entry(handle, dentry, inode, >>> - OCFS2_I(inode)->ip_blkno, parent_di_bh, >>> + OCFS2_I(inode)->ip_blkno, dir_bh, >>> &lookup); >>> if (status < 0) { >>> mlog_errno(status); >>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>> iput(orphan_dir_inode); >>> leave: >>> >>> - ocfs2_inode_unlock(dir, 1); >>> - >>> brelse(di_bh); >>> - brelse(parent_di_bh); >>> brelse(orphan_dir_bh); >>> >>> ocfs2_free_dir_lookup_result(&lookup); >>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h >>> index 9cc891eb874e..03a2c526e2c1 100644 >>> --- a/fs/ocfs2/namei.h >>> +++ b/fs/ocfs2/namei.h >>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, >>> struct buffer_head *orphan_dir_bh, >>> bool dio); >>> int ocfs2_create_inode_in_orphan(struct inode *dir, >>> + struct buffer_head **dir_bh, >>> int mode, >>> struct inode **new_inode); >>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, >>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >>> struct inode *inode, struct buffer_head *di_bh, >>> int update_isize, loff_t end); >>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>> + struct buffer_head *dir_bh, >>> struct inode *new_inode, >>> struct dentry *new_dentry); >>> >>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c >>> index 7f6355cbb587..a9a0c7c37e8e 100644 >>> --- a/fs/ocfs2/refcounttree.c >>> +++ b/fs/ocfs2/refcounttree.c >>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>> { >>> int error, had_lock; >>> struct inode *inode = d_inode(old_dentry); >>> - struct buffer_head *old_bh = NULL; >>> + struct buffer_head *old_bh = NULL, *dir_bh = NULL; >>> struct inode *new_orphan_inode = NULL; >>> struct ocfs2_lock_holder oh; >>> >>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>> return -EOPNOTSUPP; >>> >>> >>> - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, >>> + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, >>> &new_orphan_inode); >>> if (error) { >>> mlog_errno(error); >>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>> >>> /* If the security isn't preserved, we need to re-initialize them. 
*/ >>> if (!preserve) { >>> - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, >>> + error = ocfs2_init_security_and_acl(dir, dir_bh, >>> + new_orphan_inode, >>> &new_dentry->d_name); >>> if (error) >>> mlog_errno(error); >>> } >>> if (!error) { >>> - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, >>> + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, >>> + new_orphan_inode, >>> new_dentry); >>> if (error) >>> mlog_errno(error); >>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>> iput(new_orphan_inode); >>> } >>> >>> + if (dir_bh) { >>> + ocfs2_inode_unlock(dir, 1); >>> + brelse(dir_bh); >>> + } >>> + >>> return error; >>> } >>> >>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c >>> index dd784eb0cd7c..3f23e3a5018c 100644 >>> --- a/fs/ocfs2/xattr.c >>> +++ b/fs/ocfs2/xattr.c >>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >>> /* >>> * Initialize security and acl for a already created inode. >>> * Used for reflink a non-preserve-security file. >>> - * >>> - * It uses common api like ocfs2_xattr_set, so the caller >>> - * must not hold any lock expect i_mutex. >>> */ >>> int ocfs2_init_security_and_acl(struct inode *dir, >>> + struct buffer_head *dir_bh, >>> struct inode *inode, >>> const struct qstr *qstr) >>> { >>> int ret = 0; >>> - struct buffer_head *dir_bh = NULL; >>> >>> ret = ocfs2_init_security_get(inode, dir, qstr, NULL); >>> if (ret) { >>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, >>> goto leave; >>> } >>> >>> - ret = ocfs2_inode_lock(dir, &dir_bh, 0); >>> - if (ret) { >>> - mlog_errno(ret); >>> - goto leave; >>> - } >>> ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); >>> if (ret) >>> mlog_errno(ret); >>> >>> - ocfs2_inode_unlock(dir, 0); >>> - brelse(dir_bh); >>> leave: >>> return ret; >>> } >>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h >>> index 00308b57f64f..b27fd8ba0019 100644 >>> --- a/fs/ocfs2/xattr.h >>> +++ b/fs/ocfs2/xattr.h >>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >>> struct buffer_head *new_bh, >>> bool preserve_security); >>> int ocfs2_init_security_and_acl(struct inode *dir, >>> + struct buffer_head *dir_bh, >>> struct inode *inode, >>> const struct qstr *qstr); >>> #endif /* OCFS2_XATTR_H */ >>> >> >> >> _______________________________________________ >> Ocfs2-devel mailing list >> Ocfs2-devel@oss.oracle.com >> https://oss.oracle.com/mailman/listinfo/ocfs2-devel >
> On Sep 7, 2021, at 11:06 PM, Gang He <ghe@suse.com> wrote: > > > > On 2021/9/7 23:57, Wengang Wang wrote: >> Hi Gang, >> Sure, I will look into the problem you are trying to address. Any bug fix and performance improvement is welcomed! >> Well, can you please provide the analysis on the tcpdumps between the (two) nodes that covers the reflink operation with/without your patch to show how you saved dlm locking ping-pongs? > The code change will hold the reflink destination dir inode dlm lock, release it until the whole reflink process is completed. > The current code will hold/release this dlm lock three times during the reflink process. If there are some concurrently reflink operation to that directory from other nodes, the ping-pong effect is that directory inode related data will be synchronized(multiple times) to the disk when inode dlm lock is downconverted during one reflink operation. > I think the above can be a good summary, but not details or the locking ping-pong procedure. I need the details to understand the situation. For details, Which locks are involved, what are the locking types, which nodes are involved. Say like this (just for an example, not related to the problem here): Node 1 Node 2 Node 3 ——————————— ——————————— ———————————————————— dir1 meta lock EX orphandir1 lock EX dir1 meta lock EX orphadir2 lock EX dir1 meta lock EX orphadir3 lock EX dir1 meta block EX (again) ……………….. > According to my test, running the reflink command to clone a file to the same directory repeatedly from three nodes, the code change can shorten the previous half of the time. > I am not sure if above is a typical use case. What else cases did you test? Did it help in case only one node do the reflink? What if there are concurrent file creating/removing operations going on under the target directory when the reflink is going on? I think when you are trying to make a performance improvement, you should provide the performance data for different test cases, like this: ———————————————————————————————— Test case desc | orig performance | performance after patched | ———————————————————————————————— test case1 (details) | perf data | perf data | ———————————————————————————————- test case2 (details) | perf data | perf data | ——————————————————————————————— …… thanks, wengang > Thanks > Gang > >> And what cases did you test to get better performance? >> thanks, >> wengang >>> On Aug 30, 2021, at 11:25 PM, Gang He <ghe@suse.com> wrote: >>> >>> Hello Joseph and Wengang, >>> >>> When you have time, please help review this patch. >>> About the deadlock problem which was caused by ocfs2_downconvert_lock >>> failure, we have the fix patch, it is very key. >>> But I feel this patch is still useful as a optimization patch, the user >>> case is to reflink the files to the same directory concurrently, our >>> users usually backup the files(via reflink) from the cluster nodes >>> concurrently(via crontab) every day/hour. >>> The current design, during the reflink process, the node will >>> acquire/release dlm lock of the target directory multiple times, >>> this is very inefficient in concurrently reflink. >>> >>> >>> Thanks >>> Gang >>> >>> On 2021/8/26 15:59, Gang He wrote: >>>> During the reflink process, we should acquire the target directory >>>> inode dlm lock at the beginning, and hold this dlm lock until end >>>> of the function. >>>> With this patch, we avoid dlm lock ping-pong effect when clone >>>> files to the same directory simultaneously from multiple nodes. 
>>>> There is a typical user scenario, users regularly back up files >>>> to a specified directory through the reflink feature from the >>>> multiple nodes. >>>> >>>> Signed-off-by: Gang He <ghe@suse.com> >>>> --- >>>> fs/ocfs2/namei.c | 32 +++++++++++++------------------- >>>> fs/ocfs2/namei.h | 2 ++ >>>> fs/ocfs2/refcounttree.c | 15 +++++++++++---- >>>> fs/ocfs2/xattr.c | 12 +----------- >>>> fs/ocfs2/xattr.h | 1 + >>>> 5 files changed, 28 insertions(+), 34 deletions(-) >>>> >>>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c >>>> index 2c46ff6ba4ea..f8bbb22cc60b 100644 >>>> --- a/fs/ocfs2/namei.c >>>> +++ b/fs/ocfs2/namei.c >>>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, >>>> } >>>> >>>> int ocfs2_create_inode_in_orphan(struct inode *dir, >>>> + struct buffer_head **dir_bh, >>>> int mode, >>>> struct inode **new_inode) >>>> { >>>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, >>>> >>>> brelse(new_di_bh); >>>> >>>> - if (!status) >>>> - *new_inode = inode; >>>> - >>>> ocfs2_free_dir_lookup_result(&orphan_insert); >>>> >>>> - ocfs2_inode_unlock(dir, 1); >>>> - brelse(parent_di_bh); >>>> + if (!status) { >>>> + *new_inode = inode; >>>> + *dir_bh = parent_di_bh; >>>> + } else { >>>> + ocfs2_inode_unlock(dir, 1); >>>> + brelse(parent_di_bh); >>>> + } >>>> + >>>> return status; >>>> } >>>> >>>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >>>> } >>>> >>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>> + struct buffer_head *dir_bh, >>>> struct inode *inode, >>>> struct dentry *dentry) >>>> { >>>> int status = 0; >>>> - struct buffer_head *parent_di_bh = NULL; >>>> handle_t *handle = NULL; >>>> struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); >>>> struct ocfs2_dinode *dir_di, *di; >>>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>> (unsigned long long)OCFS2_I(dir)->ip_blkno, >>>> (unsigned long long)OCFS2_I(inode)->ip_blkno); >>>> >>>> - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); >>>> - if (status < 0) { >>>> - if (status != -ENOENT) >>>> - mlog_errno(status); >>>> - return status; >>>> - } >>>> - >>>> - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; >>>> + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; >>>> if (!dir_di->i_links_count) { >>>> /* can't make a file in a deleted directory. */ >>>> status = -ENOENT; >>>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>> goto leave; >>>> >>>> /* get a spot inside the dir. 
*/ >>>> - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, >>>> + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, >>>> dentry->d_name.name, >>>> dentry->d_name.len, &lookup); >>>> if (status < 0) { >>>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>> ocfs2_journal_dirty(handle, di_bh); >>>> >>>> status = ocfs2_add_entry(handle, dentry, inode, >>>> - OCFS2_I(inode)->ip_blkno, parent_di_bh, >>>> + OCFS2_I(inode)->ip_blkno, dir_bh, >>>> &lookup); >>>> if (status < 0) { >>>> mlog_errno(status); >>>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>> iput(orphan_dir_inode); >>>> leave: >>>> >>>> - ocfs2_inode_unlock(dir, 1); >>>> - >>>> brelse(di_bh); >>>> - brelse(parent_di_bh); >>>> brelse(orphan_dir_bh); >>>> >>>> ocfs2_free_dir_lookup_result(&lookup); >>>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h >>>> index 9cc891eb874e..03a2c526e2c1 100644 >>>> --- a/fs/ocfs2/namei.h >>>> +++ b/fs/ocfs2/namei.h >>>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, >>>> struct buffer_head *orphan_dir_bh, >>>> bool dio); >>>> int ocfs2_create_inode_in_orphan(struct inode *dir, >>>> + struct buffer_head **dir_bh, >>>> int mode, >>>> struct inode **new_inode); >>>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, >>>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >>>> struct inode *inode, struct buffer_head *di_bh, >>>> int update_isize, loff_t end); >>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>> + struct buffer_head *dir_bh, >>>> struct inode *new_inode, >>>> struct dentry *new_dentry); >>>> >>>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c >>>> index 7f6355cbb587..a9a0c7c37e8e 100644 >>>> --- a/fs/ocfs2/refcounttree.c >>>> +++ b/fs/ocfs2/refcounttree.c >>>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>> { >>>> int error, had_lock; >>>> struct inode *inode = d_inode(old_dentry); >>>> - struct buffer_head *old_bh = NULL; >>>> + struct buffer_head *old_bh = NULL, *dir_bh = NULL; >>>> struct inode *new_orphan_inode = NULL; >>>> struct ocfs2_lock_holder oh; >>>> >>>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>> return -EOPNOTSUPP; >>>> >>>> >>>> - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, >>>> + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, >>>> &new_orphan_inode); >>>> if (error) { >>>> mlog_errno(error); >>>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>> >>>> /* If the security isn't preserved, we need to re-initialize them. 
*/ >>>> if (!preserve) { >>>> - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, >>>> + error = ocfs2_init_security_and_acl(dir, dir_bh, >>>> + new_orphan_inode, >>>> &new_dentry->d_name); >>>> if (error) >>>> mlog_errno(error); >>>> } >>>> if (!error) { >>>> - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, >>>> + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, >>>> + new_orphan_inode, >>>> new_dentry); >>>> if (error) >>>> mlog_errno(error); >>>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>> iput(new_orphan_inode); >>>> } >>>> >>>> + if (dir_bh) { >>>> + ocfs2_inode_unlock(dir, 1); >>>> + brelse(dir_bh); >>>> + } >>>> + >>>> return error; >>>> } >>>> >>>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c >>>> index dd784eb0cd7c..3f23e3a5018c 100644 >>>> --- a/fs/ocfs2/xattr.c >>>> +++ b/fs/ocfs2/xattr.c >>>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >>>> /* >>>> * Initialize security and acl for a already created inode. >>>> * Used for reflink a non-preserve-security file. >>>> - * >>>> - * It uses common api like ocfs2_xattr_set, so the caller >>>> - * must not hold any lock expect i_mutex. >>>> */ >>>> int ocfs2_init_security_and_acl(struct inode *dir, >>>> + struct buffer_head *dir_bh, >>>> struct inode *inode, >>>> const struct qstr *qstr) >>>> { >>>> int ret = 0; >>>> - struct buffer_head *dir_bh = NULL; >>>> >>>> ret = ocfs2_init_security_get(inode, dir, qstr, NULL); >>>> if (ret) { >>>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, >>>> goto leave; >>>> } >>>> >>>> - ret = ocfs2_inode_lock(dir, &dir_bh, 0); >>>> - if (ret) { >>>> - mlog_errno(ret); >>>> - goto leave; >>>> - } >>>> ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); >>>> if (ret) >>>> mlog_errno(ret); >>>> >>>> - ocfs2_inode_unlock(dir, 0); >>>> - brelse(dir_bh); >>>> leave: >>>> return ret; >>>> } >>>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h >>>> index 00308b57f64f..b27fd8ba0019 100644 >>>> --- a/fs/ocfs2/xattr.h >>>> +++ b/fs/ocfs2/xattr.h >>>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >>>> struct buffer_head *new_bh, >>>> bool preserve_security); >>>> int ocfs2_init_security_and_acl(struct inode *dir, >>>> + struct buffer_head *dir_bh, >>>> struct inode *inode, >>>> const struct qstr *qstr); >>>> #endif /* OCFS2_XATTR_H */ >>>> >>> >>> >>> _______________________________________________ >>> Ocfs2-devel mailing list >>> Ocfs2-devel@oss.oracle.com >>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel >
Hi Wengang, Sorry for delayed reply. On 2021/9/9 0:00, Wengang Wang wrote: > > >> On Sep 7, 2021, at 11:06 PM, Gang He <ghe@suse.com> wrote: >> >> >> >> On 2021/9/7 23:57, Wengang Wang wrote: >>> Hi Gang, >>> Sure, I will look into the problem you are trying to address. Any bug fix and performance improvement is welcomed! >>> Well, can you please provide the analysis on the tcpdumps between the (two) nodes that covers the reflink operation with/without your patch to show how you saved dlm locking ping-pongs? >> The code change will hold the reflink destination dir inode dlm lock, release it until the whole reflink process is completed. >> The current code will hold/release this dlm lock three times during the reflink process. If there are some concurrently reflink operation to that directory from other nodes, the ping-pong effect is that directory inode related data will be synchronized(multiple times) to the disk when inode dlm lock is downconverted during one reflink operation. >> > > I think the above can be a good summary, but not details or the locking ping-pong procedure. I need the details to understand the situation. For details, Which locks are involved, what are the locking types, which nodes are involved. The problem is clone files to the same directory simultaneously, for some users, they like to add this task to each node as a crontab job. The main impact factor is the destination dir inode dlm lock in this case, each reflink operation will acquire/release this dlm lock three time. For parallel environment, each node will contend this dlm lock, that means there will be some lock downconvert costs(sync the inode meta-data to the disk before downconvert). For the code logic, I think we can identify this problem clearly,I do not think we need performance related tool to help us. Here, I can share my test results for clone files to the same directory simultaneously. 1) test script on each node: https://pastebin.com/y8EcCjWK 2) Performance log of current parallel reflink: https://pastebin.com/qGSedy8E 3) Performance log of parallel reflink after patching: https://pastebin.com/xkyxtNU4 For the performance logs, the test time has been cut in half for every hundred reflink operations. In sum, I think this code change logic is simple and clear, can improve the performance in the above case. Of course, I also want more people to review if there is any risk after this patch, e.g. add dead-lock risk, etc. Thanks Gang > > Say like this (just for an example, not related to the problem here): > > Node 1 Node 2 Node 3 > ——————————— ——————————— ———————————————————— > dir1 meta lock EX > orphandir1 lock EX > dir1 meta lock EX > orphadir2 lock EX > > dir1 meta lock EX > orphadir3 lock EX > dir1 meta block EX (again) > ……………….. > > >> According to my test, running the reflink command to clone a file to the same directory repeatedly from three nodes, the code change can shorten the previous half of the time. >> > > I am not sure if above is a typical use case. > What else cases did you test? Did it help in case only one node do the reflink? > What if there are concurrent file creating/removing operations going on under the target directory when the reflink is going on? 
> > I think when you are trying to make a performance improvement, you should provide the performance data for different test cases, like this: > > ———————————————————————————————— > Test case desc | orig performance | performance after patched | > ———————————————————————————————— > test case1 (details) | perf data | perf data | > ———————————————————————————————- > test case2 (details) | perf data | perf data | > ——————————————————————————————— > …… > > thanks, > wengang > >> Thanks >> Gang >> >>> And what cases did you test to get better performance? >>> thanks, >>> wengang >>>> On Aug 30, 2021, at 11:25 PM, Gang He <ghe@suse.com> wrote: >>>> >>>> Hello Joseph and Wengang, >>>> >>>> When you have time, please help review this patch. >>>> About the deadlock problem which was caused by ocfs2_downconvert_lock >>>> failure, we have the fix patch, it is very key. >>>> But I feel this patch is still useful as a optimization patch, the user >>>> case is to reflink the files to the same directory concurrently, our >>>> users usually backup the files(via reflink) from the cluster nodes >>>> concurrently(via crontab) every day/hour. >>>> The current design, during the reflink process, the node will >>>> acquire/release dlm lock of the target directory multiple times, >>>> this is very inefficient in concurrently reflink. >>>> >>>> >>>> Thanks >>>> Gang >>>> >>>> On 2021/8/26 15:59, Gang He wrote: >>>>> During the reflink process, we should acquire the target directory >>>>> inode dlm lock at the beginning, and hold this dlm lock until end >>>>> of the function. >>>>> With this patch, we avoid dlm lock ping-pong effect when clone >>>>> files to the same directory simultaneously from multiple nodes. >>>>> There is a typical user scenario, users regularly back up files >>>>> to a specified directory through the reflink feature from the >>>>> multiple nodes. 
>>>>> >>>>> Signed-off-by: Gang He <ghe@suse.com> >>>>> --- >>>>> fs/ocfs2/namei.c | 32 +++++++++++++------------------- >>>>> fs/ocfs2/namei.h | 2 ++ >>>>> fs/ocfs2/refcounttree.c | 15 +++++++++++---- >>>>> fs/ocfs2/xattr.c | 12 +----------- >>>>> fs/ocfs2/xattr.h | 1 + >>>>> 5 files changed, 28 insertions(+), 34 deletions(-) >>>>> >>>>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c >>>>> index 2c46ff6ba4ea..f8bbb22cc60b 100644 >>>>> --- a/fs/ocfs2/namei.c >>>>> +++ b/fs/ocfs2/namei.c >>>>> @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, >>>>> } >>>>> >>>>> int ocfs2_create_inode_in_orphan(struct inode *dir, >>>>> + struct buffer_head **dir_bh, >>>>> int mode, >>>>> struct inode **new_inode) >>>>> { >>>>> @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, >>>>> >>>>> brelse(new_di_bh); >>>>> >>>>> - if (!status) >>>>> - *new_inode = inode; >>>>> - >>>>> ocfs2_free_dir_lookup_result(&orphan_insert); >>>>> >>>>> - ocfs2_inode_unlock(dir, 1); >>>>> - brelse(parent_di_bh); >>>>> + if (!status) { >>>>> + *new_inode = inode; >>>>> + *dir_bh = parent_di_bh; >>>>> + } else { >>>>> + ocfs2_inode_unlock(dir, 1); >>>>> + brelse(parent_di_bh); >>>>> + } >>>>> + >>>>> return status; >>>>> } >>>>> >>>>> @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >>>>> } >>>>> >>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>>> + struct buffer_head *dir_bh, >>>>> struct inode *inode, >>>>> struct dentry *dentry) >>>>> { >>>>> int status = 0; >>>>> - struct buffer_head *parent_di_bh = NULL; >>>>> handle_t *handle = NULL; >>>>> struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); >>>>> struct ocfs2_dinode *dir_di, *di; >>>>> @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>>> (unsigned long long)OCFS2_I(dir)->ip_blkno, >>>>> (unsigned long long)OCFS2_I(inode)->ip_blkno); >>>>> >>>>> - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); >>>>> - if (status < 0) { >>>>> - if (status != -ENOENT) >>>>> - mlog_errno(status); >>>>> - return status; >>>>> - } >>>>> - >>>>> - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; >>>>> + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; >>>>> if (!dir_di->i_links_count) { >>>>> /* can't make a file in a deleted directory. */ >>>>> status = -ENOENT; >>>>> @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>>> goto leave; >>>>> >>>>> /* get a spot inside the dir. 
*/ >>>>> - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, >>>>> + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, >>>>> dentry->d_name.name, >>>>> dentry->d_name.len, &lookup); >>>>> if (status < 0) { >>>>> @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>>> ocfs2_journal_dirty(handle, di_bh); >>>>> >>>>> status = ocfs2_add_entry(handle, dentry, inode, >>>>> - OCFS2_I(inode)->ip_blkno, parent_di_bh, >>>>> + OCFS2_I(inode)->ip_blkno, dir_bh, >>>>> &lookup); >>>>> if (status < 0) { >>>>> mlog_errno(status); >>>>> @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>>> iput(orphan_dir_inode); >>>>> leave: >>>>> >>>>> - ocfs2_inode_unlock(dir, 1); >>>>> - >>>>> brelse(di_bh); >>>>> - brelse(parent_di_bh); >>>>> brelse(orphan_dir_bh); >>>>> >>>>> ocfs2_free_dir_lookup_result(&lookup); >>>>> diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h >>>>> index 9cc891eb874e..03a2c526e2c1 100644 >>>>> --- a/fs/ocfs2/namei.h >>>>> +++ b/fs/ocfs2/namei.h >>>>> @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, >>>>> struct buffer_head *orphan_dir_bh, >>>>> bool dio); >>>>> int ocfs2_create_inode_in_orphan(struct inode *dir, >>>>> + struct buffer_head **dir_bh, >>>>> int mode, >>>>> struct inode **new_inode); >>>>> int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, >>>>> @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, >>>>> struct inode *inode, struct buffer_head *di_bh, >>>>> int update_isize, loff_t end); >>>>> int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, >>>>> + struct buffer_head *dir_bh, >>>>> struct inode *new_inode, >>>>> struct dentry *new_dentry); >>>>> >>>>> diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c >>>>> index 7f6355cbb587..a9a0c7c37e8e 100644 >>>>> --- a/fs/ocfs2/refcounttree.c >>>>> +++ b/fs/ocfs2/refcounttree.c >>>>> @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>>> { >>>>> int error, had_lock; >>>>> struct inode *inode = d_inode(old_dentry); >>>>> - struct buffer_head *old_bh = NULL; >>>>> + struct buffer_head *old_bh = NULL, *dir_bh = NULL; >>>>> struct inode *new_orphan_inode = NULL; >>>>> struct ocfs2_lock_holder oh; >>>>> >>>>> @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>>> return -EOPNOTSUPP; >>>>> >>>>> >>>>> - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, >>>>> + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, >>>>> &new_orphan_inode); >>>>> if (error) { >>>>> mlog_errno(error); >>>>> @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>>> >>>>> /* If the security isn't preserved, we need to re-initialize them. 
*/ >>>>> if (!preserve) { >>>>> - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, >>>>> + error = ocfs2_init_security_and_acl(dir, dir_bh, >>>>> + new_orphan_inode, >>>>> &new_dentry->d_name); >>>>> if (error) >>>>> mlog_errno(error); >>>>> } >>>>> if (!error) { >>>>> - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, >>>>> + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, >>>>> + new_orphan_inode, >>>>> new_dentry); >>>>> if (error) >>>>> mlog_errno(error); >>>>> @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, >>>>> iput(new_orphan_inode); >>>>> } >>>>> >>>>> + if (dir_bh) { >>>>> + ocfs2_inode_unlock(dir, 1); >>>>> + brelse(dir_bh); >>>>> + } >>>>> + >>>>> return error; >>>>> } >>>>> >>>>> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c >>>>> index dd784eb0cd7c..3f23e3a5018c 100644 >>>>> --- a/fs/ocfs2/xattr.c >>>>> +++ b/fs/ocfs2/xattr.c >>>>> @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >>>>> /* >>>>> * Initialize security and acl for a already created inode. >>>>> * Used for reflink a non-preserve-security file. >>>>> - * >>>>> - * It uses common api like ocfs2_xattr_set, so the caller >>>>> - * must not hold any lock expect i_mutex. >>>>> */ >>>>> int ocfs2_init_security_and_acl(struct inode *dir, >>>>> + struct buffer_head *dir_bh, >>>>> struct inode *inode, >>>>> const struct qstr *qstr) >>>>> { >>>>> int ret = 0; >>>>> - struct buffer_head *dir_bh = NULL; >>>>> >>>>> ret = ocfs2_init_security_get(inode, dir, qstr, NULL); >>>>> if (ret) { >>>>> @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, >>>>> goto leave; >>>>> } >>>>> >>>>> - ret = ocfs2_inode_lock(dir, &dir_bh, 0); >>>>> - if (ret) { >>>>> - mlog_errno(ret); >>>>> - goto leave; >>>>> - } >>>>> ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); >>>>> if (ret) >>>>> mlog_errno(ret); >>>>> >>>>> - ocfs2_inode_unlock(dir, 0); >>>>> - brelse(dir_bh); >>>>> leave: >>>>> return ret; >>>>> } >>>>> diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h >>>>> index 00308b57f64f..b27fd8ba0019 100644 >>>>> --- a/fs/ocfs2/xattr.h >>>>> +++ b/fs/ocfs2/xattr.h >>>>> @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, >>>>> struct buffer_head *new_bh, >>>>> bool preserve_security); >>>>> int ocfs2_init_security_and_acl(struct inode *dir, >>>>> + struct buffer_head *dir_bh, >>>>> struct inode *inode, >>>>> const struct qstr *qstr); >>>>> #endif /* OCFS2_XATTR_H */ >>>>> >>>> >>>> >>>> _______________________________________________ >>>> Ocfs2-devel mailing list >>>> Ocfs2-devel@oss.oracle.com >>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel >> >
On Sep 14, 2021, at 12:34 AM, Gang He <ghe@suse.com> wrote:
Hi Wengang,
Sorry for delayed reply.
On 2021/9/9 0:00, Wengang Wang wrote:
On Sep 7, 2021, at 11:06 PM, Gang He <ghe@suse.com> wrote:
On 2021/9/7 23:57, Wengang Wang wrote:
Hi Gang,
Sure, I will look into the problem you are trying to address. Any bug fix and performance improvement is welcomed!
Well, can you please provide the analysis on the tcpdumps between the (two) nodes that covers the reflink operation with/without your patch to show how you saved dlm locking ping-pongs?
The code change will hold the reflink destination dir inode dlm lock, release it until the whole reflink process is completed.
The current code will hold/release this dlm lock three times during the reflink process. If there are some concurrently reflink operation to that directory from other nodes, the ping-pong effect is that directory inode related data will be synchronized(multiple times) to the disk when inode dlm lock is downconverted during one reflink operation.
I think the above can be a good summary, but not details or the locking ping-pong procedure. I need the details to understand the situation. For details, Which locks are involved, what are the locking types, which nodes are involved.
The problem is cloning files to the same directory simultaneously; some users like to add this task to each node as a crontab job.
The main impact factor in this case is the destination dir inode dlm lock: each reflink operation will acquire/release this dlm lock three times. In a parallel environment, each node contends for this dlm lock, which means there are lock downconvert costs (the inode meta-data is synced to disk before the downconvert).
For the code logic, I think we can identify this problem clearly; I do not think we need a performance-related tool to help us.
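As a rough back-of-the-envelope illustration (a count, not measured data): with three nodes each running 100 reflinks into the same directory, the unpatched path takes the directory EX lock three times per reflink, while the patched path takes it once:

    # Illustration only: count of EX acquisitions on the destination dir lockres.
    nodes=3
    reflinks_per_node=100
    echo "unpatched: $((nodes * reflinks_per_node * 3)) EX acquisitions"
    echo "patched:   $((nodes * reflinks_per_node * 1)) EX acquisitions"

That prints 900 vs 300; every acquisition that has to be granted away from another node can force a downconvert with a metadata flush, which is where the expected saving comes from.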
Here, I can share my test results for cloning files to the same directory simultaneously.
1) test script on each node: https://pastebin.com/y8EcCjWK
I am pasting your test script here:
loop=1
while ((loop++)) ; do

    for i in `seq 1 100`; do
        reflink "/mnt/shared/testnode1.qcow2" "/mnt/shared/.snapshots/testnode1.qcow2.${loop}.${i}.`date +%m%d%H%M%S`.`hostname`"
    done

    rm -f /mnt/shared/.snapshots/testnode1.qcow2.*.`hostname`
    echo "`hostname` `date +%m%d%H%M%S`: loop - $loop"
done
I think your test script depends heavily on the size of the file testnode1.qcow2 and how fragmented it is. That impacts the number of extent records to be copied.
Let’s simplify the reflink procedure with your patch applied:
1) lock destination dir
2) extent record copy
3) unlock destination dir
Apply this to multinode case:
node1          node2          node3
————           ————           ————
lock
extent copy
unlock
               lock
               extent copy
               unlock
                              lock
                              extent copy
                              unlock
As you said, you saved the time of down-converting by serializing the reflink operations targeting the directory. But it seems you skipped the impact of the extent copy.
If the extent copy takes much more time than the lock downconverts, your patch may slow down the whole set of reflink operations across the three nodes.
The time for the extent copy depends on:
a) the number of extents to copy: more extents, more time
b) the CPU load on the nodes: higher CPU load, more time
c) the OCFS2 device load: higher load, more time
I don't know if you considered the above factors when you did your test. I was once playing with reflink on a well-fragmented 50GB file (though it was not OCFS2). When load is added, a single run of reflink takes 20+ minutes.
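Just to make this concrete with made-up numbers (not measured anywhere), and assuming the copies themselves are not otherwise serialized (e.g. different source files): suppose the extent copy of a large fragmented file takes about 4s and one EX downconvert of the destination dir costs on the order of 10ms. Without your patch the dir lock is only held for the short metadata phases, so the 4s copies on the three nodes can overlap and three concurrent reflinks finish in roughly 4s plus a handful of downconverts. With your patch each node holds the dir lock for the whole reflink, so the three reflinks serialize and take roughly 3 x 4s = 12s, even though fewer downconverts happen. The saved downconverts should only dominate when the copy itself is cheap (a small or lightly fragmented file) and the nodes issue many reflinks back to back.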
So I’d like to see your test for at least this case:
I) make the reflink source file 50GiB long (with FS cluster size 1MiB) and
II) make the reflink source file well fragmented (suggest 40% fragmentation) and
III) add some CPU and block device load and
IV) add some file creation/removal operations under the same destination directory as the reflinks, and collect the time for each operation. Do it on all three nodes.
For II), I'd suggest a 40% fragmentation ratio; some example output and times for the reflinks:
debugfs: frag /file1
Inode: 592130 % fragmented: 40.00 clusters: 51200 extents: 20481 score: 40
[root@wengwan-ocfs2-3 opc]# time reflink /ocfs2/file1 /ocfs2/dir1/reflink1
real 0m6.837s
user 0m0.000s
sys 0m6.828s
[root@wengwan-ocfs2-3 opc]# time reflink /ocfs2/file1 /ocfs2/dir1/reflink2
real 0m3.799s
user 0m0.000s
sys 0m3.799s
[root@wengwan-ocfs2-3 opc]# time reflink /ocfs2/file1 /ocfs2/dir1/reflink3
real 0m3.802s
user 0m0.001s
sys 0m3.801s
I'd guess you may get a negative result compared to the original kernel for both reflink and file creation/removal.
thanks,
wengang
2) Performance log of current parallel reflink:
https://pastebin.com/qGSedy8E
3) Performance log of parallel reflink after patching:
https://pastebin.com/xkyxtNU4
For the performance logs, the test time has been cut in half for every hundred reflink operations.
In sum, I think this code change is simple and clear, and it can improve the performance in the above case. Of course, I also want more people to review whether this patch introduces any risk, e.g. a dead-lock risk, etc.
Thanks
Gang
Say like this (just for an example, not related to the problem here):
Node 1                      Node 2                      Node 3
———————————                 ———————————                 ————————————————————
dir1 meta lock EX
orphandir1 lock EX
                            dir1 meta lock EX
                            orphandir2 lock EX
                                                        dir1 meta lock EX
                                                        orphandir3 lock EX
dir1 meta lock EX (again)
………………..
According to my test, running the reflink command to clone a file to the same directory repeatedly from three nodes, the code change can cut the time to about half of what it was before.
I am not sure if the above is a typical use case.
What other cases did you test? Did it help in the case where only one node does the reflink?
What if there are concurrent file create/remove operations going on under the target directory while the reflink is going on?
I think when you are trying to make a performance improvement, you should provide the performance data for different test cases, like this:
—————————————————————————————————————————————————————————
Test case desc        | orig performance | performance after patched
—————————————————————————————————————————————————————————
test case1 (details)  | perf data        | perf data
—————————————————————————————————————————————————————————
test case2 (details)  | perf data        | perf data
—————————————————————————————————————————————————————————
……
thanks,
wengang
Thanks
Gang
And what cases did you test to get better performance?
thanks,
wengang
On Aug 30, 2021, at 11:25 PM, Gang He <ghe@suse.com<mailto:ghe@suse.com>> wrote:
Hello Joseph and Wengang,
When you have time, please help review this patch.
About the deadlock problem which was caused by ocfs2_downconvert_lock
failure, we have the fix patch, it is very key.
But I feel this patch is still useful as a optimization patch, the user
case is to reflink the files to the same directory concurrently, our
users usually backup the files(via reflink) from the cluster nodes
concurrently(via crontab) every day/hour.
The current design, during the reflink process, the node will
acquire/release dlm lock of the target directory multiple times,
this is very inefficient in concurrently reflink.
Thanks
Gang
On 2021/8/26 15:59, Gang He wrote:
During the reflink process, we should acquire the target directory
inode dlm lock at the beginning, and hold this dlm lock until end
of the function.
With this patch, we avoid dlm lock ping-pong effect when clone
files to the same directory simultaneously from multiple nodes.
There is a typical user scenario, users regularly back up files
to a specified directory through the reflink feature from the
multiple nodes.
Signed-off-by: Gang He <ghe@suse.com<mailto:ghe@suse.com>>
---
fs/ocfs2/namei.c | 32 +++++++++++++-------------------
fs/ocfs2/namei.h | 2 ++
fs/ocfs2/refcounttree.c | 15 +++++++++++----
fs/ocfs2/xattr.c | 12 +-----------
fs/ocfs2/xattr.h | 1 +
5 files changed, 28 insertions(+), 34 deletions(-)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea..f8bbb22cc60b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
}
int ocfs2_create_inode_in_orphan(struct inode *dir,
+ struct buffer_head **dir_bh,
int mode,
struct inode **new_inode)
{
@@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
brelse(new_di_bh);
- if (!status)
- *new_inode = inode;
-
ocfs2_free_dir_lookup_result(&orphan_insert);
- ocfs2_inode_unlock(dir, 1);
- brelse(parent_di_bh);
+ if (!status) {
+ *new_inode = inode;
+ *dir_bh = parent_di_bh;
+ } else {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(parent_di_bh);
+ }
+
return status;
}
@@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
}
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
struct dentry *dentry)
{
int status = 0;
- struct buffer_head *parent_di_bh = NULL;
handle_t *handle = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dinode *dir_di, *di;
@@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
- if (status < 0) {
- if (status != -ENOENT)
- mlog_errno(status);
- return status;
- }
-
- dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
+ dir_di = (struct ocfs2_dinode *) dir_bh->b_data;
if (!dir_di->i_links_count) {
/* can't make a file in a deleted directory. */
status = -ENOENT;
@@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto leave;
/* get a spot inside the dir. */
- status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
+ status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh,
dentry->d_name.name,
dentry->d_name.len, &lookup);
if (status < 0) {
@@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ocfs2_journal_dirty(handle, di_bh);
status = ocfs2_add_entry(handle, dentry, inode,
- OCFS2_I(inode)->ip_blkno, parent_di_bh,
+ OCFS2_I(inode)->ip_blkno, dir_bh,
&lookup);
if (status < 0) {
mlog_errno(status);
@@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
iput(orphan_dir_inode);
leave:
- ocfs2_inode_unlock(dir, 1);
-
brelse(di_bh);
- brelse(parent_di_bh);
brelse(orphan_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 9cc891eb874e..03a2c526e2c1 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh,
bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
+ struct buffer_head **dir_bh,
int mode,
struct inode **new_inode);
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
@@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
struct inode *inode, struct buffer_head *di_bh,
int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7f6355cbb587..a9a0c7c37e8e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
{
int error, had_lock;
struct inode *inode = d_inode(old_dentry);
- struct buffer_head *old_bh = NULL;
+ struct buffer_head *old_bh = NULL, *dir_bh = NULL;
struct inode *new_orphan_inode = NULL;
struct ocfs2_lock_holder oh;
@@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
return -EOPNOTSUPP;
- error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+ error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
@@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
- error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+ error = ocfs2_init_security_and_acl(dir, dir_bh,
+ new_orphan_inode,
&new_dentry->d_name);
if (error)
mlog_errno(error);
}
if (!error) {
- error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+ error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh,
+ new_orphan_inode,
new_dentry);
if (error)
mlog_errno(error);
@@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
iput(new_orphan_inode);
}
+ if (dir_bh) {
+ ocfs2_inode_unlock(dir, 1);
+ brelse(dir_bh);
+ }
+
return error;
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c..3f23e3a5018c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
/*
* Initialize security and acl for a already created inode.
* Used for reflink a non-preserve-security file.
- *
- * It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr)
{
int ret = 0;
- struct buffer_head *dir_bh = NULL;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
goto leave;
}
- ret = ocfs2_inode_lock(dir, &dir_bh, 0);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
if (ret)
mlog_errno(ret);
- ocfs2_inode_unlock(dir, 0);
- brelse(dir_bh);
leave:
return ret;
}
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 00308b57f64f..b27fd8ba0019 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
struct buffer_head *new_bh,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
+ struct buffer_head *dir_bh,
struct inode *inode,
const struct qstr *qstr);
#endif /* OCFS2_XATTR_H */
Hi Wengang and All,

On 2021/9/15 1:50, Wengang Wang wrote:
> I think your test script would heavily depend on the size of the file
> testnode1.qcow2 and how fragmented it is. That would impact the number
> of extent records to be copied.
> ...
> I'd guess you may get a negative result compared to the original kernel
> for both reflink and file creation/removal.

I created similar files (including lots of meta-data blocks) and cloned
them to the same directory. Based on my testing, it looks like the
performance is not improved between the current code and the patched code.
So, I'd like to revoke this patch.

Thanks a lot.
Gang