Message ID | 1250618822-6131-5-git-send-email-jlayton@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Why don't we have a reference to this inode already? We can't have an oplock break unless the file is open, if the file is open then we have a reference to the inode ... On Tue, Aug 18, 2009 at 1:07 PM, Jeff Layton <jlayton@redhat.com> wrote: > When an oplock break comes in, cifs needs to do things like write out > dirty data for it. It doesn't hold a reference to that inode in this > case however. Get an active reference to the inode when an oplock break > comes in. If we don't get a reference, we still need to create an oplock > queue entry so that the oplock release call gets done, but we'll want to > skip writeback in that case. > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > --- > fs/cifs/cifsfs.c | 26 ++++++++++---------------- > fs/cifs/misc.c | 4 +++- > 2 files changed, 13 insertions(+), 17 deletions(-) > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > index 2fcc722..647c5bc 100644 > --- a/fs/cifs/cifsfs.c > +++ b/fs/cifs/cifsfs.c > @@ -993,13 +993,8 @@ static int cifs_oplock_thread(void *dummyarg) > cFYI(1, ("found oplock item to write out")); > tcon = oplock->tcon; > inode = oplock->pinode; > - /* can not grab inode sem here since it would > - deadlock when oplock received on delete > - since vfs_unlink holds the i_mutex across > - the call */ > - /* mutex_lock(&inode->i_mutex);*/ > - cifsi = CIFS_I(inode); > - if (S_ISREG(inode->i_mode)) { > + if (inode && S_ISREG(inode->i_mode)) { > + cifsi = CIFS_I(inode); > #ifdef CONFIG_CIFS_EXPERIMENTAL > if (cifsi->clientCanCacheAll == 0) > break_lease(inode, FMODE_READ); > @@ -1010,17 +1005,16 @@ static int cifs_oplock_thread(void *dummyarg) > if (cifsi->clientCanCacheRead == 0) { > waitrc = filemap_fdatawait( > > inode->i_mapping); > + if (rc == 0) > + rc = waitrc; > invalidate_remote_inode(inode); > } > - if (rc == 0) > - rc = waitrc; > - } else > - rc = 0; > - /* mutex_unlock(&inode->i_mutex);*/ > - if (rc) > - cifsi->write_behind_rc = rc; > - cFYI(1, ("Oplock flush inode %p rc %d", > - inode, rc)); > + if (rc) > + cifsi->write_behind_rc = rc; > + cFYI(1, ("Oplock flush inode %p rc %d", > + inode, rc)); > + } > + iput(inode); > > /* > * releasing stale oplock after recent reconnect > diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c > index 7221af9..3bf3a52 100644 > --- a/fs/cifs/misc.c > +++ b/fs/cifs/misc.c > @@ -502,6 +502,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct > TCP_Server_Info *srv, > struct cifsInodeInfo *pCifsInode; > struct cifsFileInfo *netfile; > struct oplock_q_entry *oplock; > + struct inode *inode; > > cFYI(1, ("Checking for oplock break or dnotify response")); > if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && > @@ -600,6 +601,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct > TCP_Server_Info *srv, > pCifsInode->clientCanCacheAll = false; > if (pSMB->OplockLevel == 0) > pCifsInode->clientCanCacheRead = > false; > + inode = igrab(netfile->pInode); > > if (!oplock) { > cERROR(1, ("Unable to allocate " > @@ -608,7 +610,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct > TCP_Server_Info *srv, > } > > oplock->tcon = tcon; > - oplock->pinode = netfile->pInode; > + oplock->pinode = inode; > oplock->netfid = netfile->netfid; > spin_lock(&cifs_oplock_lock); > list_add_tail(&oplock->qhead, > -- > 1.6.0.6 > >
On Thu, 20 Aug 2009 19:42:42 -0500 Steve French <smfrench@gmail.com> wrote: > Why don't we have a reference to this inode already? We can't have an > oplock break unless the file is open, if the file is open then we have a > reference to the inode ... > Well, you have a reference when the entry goes onto the list. There's no guarantee that you'll still have one when cifs_oplock_thread goes to take it off the list however. The file could have been closed by then and the inode flushed out of the cache. Since the cifs_oplock_thread happens outside of any "normal" process behavior, we really need to take an inode reference here. > On Tue, Aug 18, 2009 at 1:07 PM, Jeff Layton <jlayton@redhat.com> wrote: > > > When an oplock break comes in, cifs needs to do things like write out > > dirty data for it. It doesn't hold a reference to that inode in this > > case however. Get an active reference to the inode when an oplock break > > comes in. If we don't get a reference, we still need to create an oplock > > queue entry so that the oplock release call gets done, but we'll want to > > skip writeback in that case. > > > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > > --- > > fs/cifs/cifsfs.c | 26 ++++++++++---------------- > > fs/cifs/misc.c | 4 +++- > > 2 files changed, 13 insertions(+), 17 deletions(-) > > > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > > index 2fcc722..647c5bc 100644 > > --- a/fs/cifs/cifsfs.c > > +++ b/fs/cifs/cifsfs.c > > @@ -993,13 +993,8 @@ static int cifs_oplock_thread(void *dummyarg) > > cFYI(1, ("found oplock item to write out")); > > tcon = oplock->tcon; > > inode = oplock->pinode; > > - /* can not grab inode sem here since it would > > - deadlock when oplock received on delete > > - since vfs_unlink holds the i_mutex across > > - the call */ > > - /* mutex_lock(&inode->i_mutex);*/ > > - cifsi = CIFS_I(inode); > > - if (S_ISREG(inode->i_mode)) { > > + if (inode && S_ISREG(inode->i_mode)) { > > + cifsi = CIFS_I(inode); > > #ifdef CONFIG_CIFS_EXPERIMENTAL > > if (cifsi->clientCanCacheAll == 0) > > break_lease(inode, FMODE_READ); > > @@ -1010,17 +1005,16 @@ static int cifs_oplock_thread(void *dummyarg) > > if (cifsi->clientCanCacheRead == 0) { > > waitrc = filemap_fdatawait( > > > > inode->i_mapping); > > + if (rc == 0) > > + rc = waitrc; > > invalidate_remote_inode(inode); > > } > > - if (rc == 0) > > - rc = waitrc; > > - } else > > - rc = 0; > > - /* mutex_unlock(&inode->i_mutex);*/ > > - if (rc) > > - cifsi->write_behind_rc = rc; > > - cFYI(1, ("Oplock flush inode %p rc %d", > > - inode, rc)); > > + if (rc) > > + cifsi->write_behind_rc = rc; > > + cFYI(1, ("Oplock flush inode %p rc %d", > > + inode, rc)); > > + } > > + iput(inode); > > > > /* > > * releasing stale oplock after recent reconnect > > diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c > > index 7221af9..3bf3a52 100644 > > --- a/fs/cifs/misc.c > > +++ b/fs/cifs/misc.c > > @@ -502,6 +502,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct > > TCP_Server_Info *srv, > > struct cifsInodeInfo *pCifsInode; > > struct cifsFileInfo *netfile; > > struct oplock_q_entry *oplock; > > + struct inode *inode; > > > > cFYI(1, ("Checking for oplock break or dnotify response")); > > if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && > > @@ -600,6 +601,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct > > TCP_Server_Info *srv, > > pCifsInode->clientCanCacheAll = false; > > if (pSMB->OplockLevel == 0) > > pCifsInode->clientCanCacheRead = > > false; > > + inode = igrab(netfile->pInode); > > > > if (!oplock) { > > cERROR(1, ("Unable to allocate " > > @@ -608,7 +610,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct > > TCP_Server_Info *srv, > > } > > > > oplock->tcon = tcon; > > - oplock->pinode = netfile->pInode; > > + oplock->pinode = inode; > > oplock->netfid = netfile->netfid; > > spin_lock(&cifs_oplock_lock); > > list_add_tail(&oplock->qhead, > > -- > > 1.6.0.6 > > > > > >
On Fri, Aug 21, 2009 at 5:36 AM, Jeff Layton <jlayton@redhat.com> wrote: > On Thu, 20 Aug 2009 19:42:42 -0500 > Steve French <smfrench@gmail.com> wrote: > > > Why don't we have a reference to this inode already? We can't have an > > oplock break unless the file is open, if the file is open then we have a > > reference to the inode ... > > > > Well, you have a reference when the entry goes onto the list. There's > no guarantee that you'll still have one when cifs_oplock_thread goes to > take it off the list however. The file could have been closed by then > An oplock response is handle based so we need to make sure the fid is valid (if not, throw away the oplock response, except for the case of batch oplock which we don't use yet). Seems odd to lock the inode when we really need the fid (file struct) We could mark the file struct (similar to the write pending flag) and delete such an entry off the oplockq in close (if we are closing a file with an oplock pending)
On Fri, 21 Aug 2009 10:18:16 -0500 Steve French <smfrench@gmail.com> wrote: > On Fri, Aug 21, 2009 at 5:36 AM, Jeff Layton <jlayton@redhat.com> wrote: > > > On Thu, 20 Aug 2009 19:42:42 -0500 > > Steve French <smfrench@gmail.com> wrote: > > > > > Why don't we have a reference to this inode already? We can't have an > > > oplock break unless the file is open, if the file is open then we have a > > > reference to the inode ... > > > > > > > Well, you have a reference when the entry goes onto the list. There's > > no guarantee that you'll still have one when cifs_oplock_thread goes to > > take it off the list however. The file could have been closed by then > > > > An oplock response is handle based so we need to make sure the fid > is valid (if not, throw away the oplock response, except for the case > of batch oplock which we don't use yet). Seems odd to lock the > inode when we really need the fid (file struct) > True. Though that's sort of a design issue with the oplock queue itself. The entry tracks an inode and not a filp. The goal with this set is to fix the use-after-free problems without changing the underlying design too much. I'm more inclined to leave this as-is until the whole approach can be redesigned. > We could mark the file struct (similar to the write pending flag) and > delete such an entry off the oplockq in close (if we are closing > a file with an oplock pending) > As long as you can do so in a non-racy way, then I'm not opposed to that long-term. The problem though is that I don't have a lot of confidence in the open file tracking code. It's extremely hard to follow and definitely has races. I don't think it's really possible to do what you suggest safely until the open file tracking code has been fixed. For now, I'm pretty sure this set should fix the problems that users are hitting in the oplock codepath today. I'd like to fix that first before we embark on a redesign of it.
On Fri, Aug 21, 2009 at 10:48 AM, Jeff Layton <jlayton@redhat.com> wrote: > > As long as you can do so in a non-racy way, then I'm not opposed to > that long-term. The problem though is that I don't have a lot of > confidence in the open file tracking code. It's extremely hard to > follow and definitely has races. I don't think it's really possible to > do what you suggest safely until the open file tracking code has been > fixed. > > For now, I'm pretty sure this set should fix the problems that users > are hitting in the oplock codepath today. I'd like to fix that first > before we embark on a redesign of it. > You may be right that this should be two stages, but your 2nd and 3rd patch are already large, so I doubt that it would grow (and might make it easier to follow and more logical).
On Fri, Aug 21, 2009 at 2:01 PM, Steve French <smfrench@gmail.com> wrote: > > > On Fri, Aug 21, 2009 at 10:48 AM, Jeff Layton <jlayton@redhat.com> wrote: > >> >> As long as you can do so in a non-racy way, then I'm not opposed to >> that long-term. The problem though is that I don't have a lot of >> confidence in the open file tracking code. It's extremely hard to >> follow and definitely has races. I don't think it's really possible to >> do what you suggest safely until the open file tracking code has been >> fixed. >> >> For now, I'm pretty sure this set should fix the problems that users >> are hitting in the oplock codepath today. I'd like to fix that first >> before we embark on a redesign of it. >> > > You may be right that this should be two stages, but your 2nd and 3rd patch > are already large, so I doubt that it would grow (and might make it easier > to follow and more logical). > There is one other thing to consider (which might lean toward your inode based approach rather than what I suggested about tie of the oplock to the file struct for the fid) ... we need to move to 1st attempting "batch oplocks" (as was discussed in June) to allow safer caching across close (and would also helps with multiple opens from the same client) - this will necessitate making the routine which frees inodes aware of oplocks. In the short run I was planning on trying this approach (grab one batch oplock on open, and only rely on the other type of oplocks when we can't get or lose the batch oplock) with the smb2 code and after a few months if it works ok consider fixing the cifs code. In the meantime if we uses patches 2 through 4 as is (one and five are obviously fine), it would be great if we could get at least one more ack. I haven't found any problems yet but they are big.
On Fri, 21 Aug 2009 14:08:24 -0500 Steve French <smfrench@gmail.com> wrote: > On Fri, Aug 21, 2009 at 2:01 PM, Steve French <smfrench@gmail.com> wrote: > > > > > > > On Fri, Aug 21, 2009 at 10:48 AM, Jeff Layton <jlayton@redhat.com> wrote: > > > >> > >> As long as you can do so in a non-racy way, then I'm not opposed to > >> that long-term. The problem though is that I don't have a lot of > >> confidence in the open file tracking code. It's extremely hard to > >> follow and definitely has races. I don't think it's really possible to > >> do what you suggest safely until the open file tracking code has been > >> fixed. > >> > >> For now, I'm pretty sure this set should fix the problems that users > >> are hitting in the oplock codepath today. I'd like to fix that first > >> before we embark on a redesign of it. > >> > > > > You may be right that this should be two stages, but your 2nd and 3rd patch > > are already large, so I doubt that it would grow (and might make it easier > > to follow and more logical). > > > > There is one other thing to consider (which might lean toward your inode > based approach rather than what I suggested about tie of the > oplock to the file struct for the fid) ... we need to move to 1st attempting > "batch oplocks" (as was discussed in June) to allow safer caching > across close (and would also helps with multiple opens from the same > client) - this will necessitate making the routine which frees inodes > aware of oplocks. In the short run I was planning on trying this approach > (grab one batch oplock on open, and only rely on the other type of oplocks > when we can't get or lose the batch oplock) with the smb2 code and > after a few months if it works ok consider fixing the cifs code. In the > meantime if we uses patches 2 through 4 as is (one and five are > obviously fine), it would be great if we could get at least one more ack. > I haven't found any problems yet but they are big. > > Sorry about the size, but there were a number of races and potential use-after-free holes in that code. I tried to do the set so that it made a logical progression of changes, but the patches do touch the same code numerous times. If it would be more helpful, I can send you the set as a single large patch. Thanks again for reviewing it promptly.
On Fri, 21 Aug 2009 14:01:21 -0500 Steve French <smfrench@gmail.com> wrote: > On Fri, Aug 21, 2009 at 10:48 AM, Jeff Layton <jlayton@redhat.com> wrote: > > > > > As long as you can do so in a non-racy way, then I'm not opposed to > > that long-term. The problem though is that I don't have a lot of > > confidence in the open file tracking code. It's extremely hard to > > follow and definitely has races. I don't think it's really possible to > > do what you suggest safely until the open file tracking code has been > > fixed. > > > > For now, I'm pretty sure this set should fix the problems that users > > are hitting in the oplock codepath today. I'd like to fix that first > > before we embark on a redesign of it. > > > > You may be right that this should be two stages, but your 2nd and 3rd patch > are already large, so I doubt that it would grow (and might make it easier > to follow and more logical). > Any more thoughts on this patchset? I've been looking over what it would take to fix up the open file handle tracking and it's going to be a substantial set of patches (on the order of the size of the cifs_iget patchset). I'd still like to do that, but don't think we can wait until then to fix this set of problems.
On Tue, Aug 18, 2009 at 1:07 PM, Jeff Layton<jlayton@redhat.com> wrote: > When an oplock break comes in, cifs needs to do things like write out > dirty data for it. It doesn't hold a reference to that inode in this > case however. Get an active reference to the inode when an oplock break > comes in. If we don't get a reference, we still need to create an oplock > queue entry so that the oplock release call gets done, but we'll want to > skip writeback in that case. > > Signed-off-by: Jeff Layton <jlayton@redhat.com> > --- >  fs/cifs/cifsfs.c |  26 ++++++++++---------------- >  fs/cifs/misc.c  |   4 +++- >  2 files changed, 13 insertions(+), 17 deletions(-) > > diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c > index 2fcc722..647c5bc 100644 > --- a/fs/cifs/cifsfs.c > +++ b/fs/cifs/cifsfs.c > @@ -993,13 +993,8 @@ static int cifs_oplock_thread(void *dummyarg) >             cFYI(1, ("found oplock item to write out")); >             tcon = oplock->tcon; >             inode = oplock->pinode; > -            /* can not grab inode sem here since it would > -                deadlock when oplock received on delete > -                since vfs_unlink holds the i_mutex across > -                the call */ > -            /* mutex_lock(&inode->i_mutex);*/ > -            cifsi = CIFS_I(inode); > -            if (S_ISREG(inode->i_mode)) { > +            if (inode && S_ISREG(inode->i_mode)) { > +                cifsi = CIFS_I(inode); >  #ifdef CONFIG_CIFS_EXPERIMENTAL >                 if (cifsi->clientCanCacheAll == 0) >                     break_lease(inode, FMODE_READ); > @@ -1010,17 +1005,16 @@ static int cifs_oplock_thread(void *dummyarg) >                 if (cifsi->clientCanCacheRead == 0) { >                     waitrc = filemap_fdatawait( >                                inode->i_mapping); > +                    if (rc == 0) > +                        rc = waitrc; >                     invalidate_remote_inode(inode); >                 } > -                if (rc == 0) > -                    rc = waitrc; > -            } else > -                rc = 0; > -            /* mutex_unlock(&inode->i_mutex);*/ > -            if (rc) > -                cifsi->write_behind_rc = rc; > -            cFYI(1, ("Oplock flush inode %p rc %d", > -                inode, rc)); > +                if (rc) > +                    cifsi->write_behind_rc = rc; > +                cFYI(1, ("Oplock flush inode %p rc %d", > +                     inode, rc)); > +            } > +            iput(inode); > >             /* >             * releasing stale oplock after recent reconnect > diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c > index 7221af9..3bf3a52 100644 > --- a/fs/cifs/misc.c > +++ b/fs/cifs/misc.c > @@ -502,6 +502,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv, >     struct cifsInodeInfo *pCifsInode; >     struct cifsFileInfo *netfile; >     struct oplock_q_entry *oplock; > +    struct inode *inode; > >     cFYI(1, ("Checking for oplock break or dnotify response")); >     if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && > @@ -600,6 +601,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv, >                 pCifsInode->clientCanCacheAll = false; >                 if (pSMB->OplockLevel == 0) >                     pCifsInode->clientCanCacheRead = false; > +                inode = igrab(netfile->pInode); > >                 if (!oplock) { >                     cERROR(1, ("Unable to allocate " > @@ -608,7 +610,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv, >                 } > >                 oplock->tcon = tcon; > -                oplock->pinode = netfile->pInode; > +                oplock->pinode = inode; >                 oplock->netfid = netfile->netfid; >                 spin_lock(&cifs_oplock_lock); >                 list_add_tail(&oplock->qhead, > -- > 1.6.0.6 > > _______________________________________________ > linux-cifs-client mailing list > linux-cifs-client@lists.samba.org > https://lists.samba.org/mailman/listinfo/linux-cifs-client > Acked-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2fcc722..647c5bc 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -993,13 +993,8 @@ static int cifs_oplock_thread(void *dummyarg) cFYI(1, ("found oplock item to write out")); tcon = oplock->tcon; inode = oplock->pinode; - /* can not grab inode sem here since it would - deadlock when oplock received on delete - since vfs_unlink holds the i_mutex across - the call */ - /* mutex_lock(&inode->i_mutex);*/ - cifsi = CIFS_I(inode); - if (S_ISREG(inode->i_mode)) { + if (inode && S_ISREG(inode->i_mode)) { + cifsi = CIFS_I(inode); #ifdef CONFIG_CIFS_EXPERIMENTAL if (cifsi->clientCanCacheAll == 0) break_lease(inode, FMODE_READ); @@ -1010,17 +1005,16 @@ static int cifs_oplock_thread(void *dummyarg) if (cifsi->clientCanCacheRead == 0) { waitrc = filemap_fdatawait( inode->i_mapping); + if (rc == 0) + rc = waitrc; invalidate_remote_inode(inode); } - if (rc == 0) - rc = waitrc; - } else - rc = 0; - /* mutex_unlock(&inode->i_mutex);*/ - if (rc) - cifsi->write_behind_rc = rc; - cFYI(1, ("Oplock flush inode %p rc %d", - inode, rc)); + if (rc) + cifsi->write_behind_rc = rc; + cFYI(1, ("Oplock flush inode %p rc %d", + inode, rc)); + } + iput(inode); /* * releasing stale oplock after recent reconnect diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7221af9..3bf3a52 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -502,6 +502,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv, struct cifsInodeInfo *pCifsInode; struct cifsFileInfo *netfile; struct oplock_q_entry *oplock; + struct inode *inode; cFYI(1, ("Checking for oplock break or dnotify response")); if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && @@ -600,6 +601,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv, pCifsInode->clientCanCacheAll = false; if (pSMB->OplockLevel == 0) pCifsInode->clientCanCacheRead = false; + inode = igrab(netfile->pInode); if (!oplock) { cERROR(1, ("Unable to allocate " @@ -608,7 +610,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv, } oplock->tcon = tcon; - oplock->pinode = netfile->pInode; + oplock->pinode = inode; oplock->netfid = netfile->netfid; spin_lock(&cifs_oplock_lock); list_add_tail(&oplock->qhead,
When an oplock break comes in, cifs needs to do things like write out dirty data for it. It doesn't hold a reference to that inode in this case however. Get an active reference to the inode when an oplock break comes in. If we don't get a reference, we still need to create an oplock queue entry so that the oplock release call gets done, but we'll want to skip writeback in that case. Signed-off-by: Jeff Layton <jlayton@redhat.com> --- fs/cifs/cifsfs.c | 26 ++++++++++---------------- fs/cifs/misc.c | 4 +++- 2 files changed, 13 insertions(+), 17 deletions(-)