Message ID | 20220223211305.296816-6-trondmy@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Readdir improvements | expand |
On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > Use the change attribute and the first cookie in a directory page > cache > entry to validate that the page is up to date. > > Suggested-by: Benjamin Coddington <bcodding@redhat.com> > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > --- > fs/nfs/dir.c | 68 > ++++++++++++++++++++++++++++------------------------ > 1 file changed, 37 insertions(+), 31 deletions(-) > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c > index f2258e926df2..5d9367d9b651 100644 > --- a/fs/nfs/dir.c > +++ b/fs/nfs/dir.c > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { > }; > > struct nfs_cache_array { > + u64 change_attr; > u64 last_cookie; > unsigned int size; > unsigned char page_full : 1, > @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct > nfs_cache_array *array) > memset(array, 0, sizeof(struct nfs_cache_array)); > } > > -static void nfs_readdir_page_init_array(struct page *page, u64 > last_cookie) > +static void nfs_readdir_page_init_array(struct page *page, u64 > last_cookie, > + u64 change_attr) > { > struct nfs_cache_array *array; There's a hunk missing here, something like: @@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, nfs_readdir_array_init(array); array->last_cookie = last_cookie; array->cookies_are_ordered = 1; + array->change_attr = change_attr; kunmap_atomic(array); } > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie, > gfp_t gfp_flags) > { > struct page *page = alloc_page(gfp_flags); > if (page) > - nfs_readdir_page_init_array(page, last_cookie); > + nfs_readdir_page_init_array(page, last_cookie, 0); > return page; > } > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct nfs_entry > *entry, struct page *page) > return ret; > } > > +static bool nfs_readdir_page_cookie_match(struct page *page, u64 > last_cookie, > + u64 change_attr) How about "nfs_readdir_page_valid()"? There's more going on than a cookie match. > +{ > + struct nfs_cache_array *array = kmap_atomic(page); > + int ret = true; > + > + if (array->change_attr != change_attr) > + ret = false; Can we skip the next test if ret = false? > + if (array->size > 0 && array->array[0].cookie != last_cookie) > + ret = false; > + kunmap_atomic(array); > + return ret; > +} > + > +static void nfs_readdir_page_unlock_and_put(struct page *page) > +{ > + unlock_page(page); > + put_page(page); > +} > + > static struct page *nfs_readdir_page_get_locked(struct address_space > *mapping, > pgoff_t index, u64 last_cookie) > { > struct page *page; > + u64 change_attr; > > page = grab_cache_page(mapping, index); > - if (page && !PageUptodate(page)) { > - nfs_readdir_page_init_array(page, last_cookie); > - if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) > - nfs_zap_mapping(mapping->host, mapping); > - SetPageUptodate(page); > + if (!page) > + return NULL; > + change_attr = inode_peek_iversion_raw(mapping->host); > + if (PageUptodate(page)) { > + if (nfs_readdir_page_cookie_match(page, last_cookie, > + change_attr)) > + return page; > + nfs_readdir_clear_array(page); Why use i_version rather than nfs_save_change_attribute? Seems having a consistent value across the pachecache and dir_verifiers would help debugging, and we've already have a bunch of machinery around the change_attribute. Don't we need to send a GETATTR with READDIR for v4? Not doing so means that the pagecache is going to behave differently for v3 and v4, and we'll potentially end up with totally bogus listings for cases where one reader has cached a page of entries in the middle of the pagecache marked with i_version A, but entries are actually from i_version A++ on the server. Then another reader comes along and follows earlier entries from i_version A on the server that lead into entries from A++. I don't think we can detect this case unless we're checking the directory on every READDIR. Sending a GETATTR for v4 doesn't eliminate that race on the server side, but does remove the large window on the client created by the attribute cache timeouts, and I think its mostly harmless performance-wise. Also, we don't need the local change_attr variable just to pass it to other functions that can access it themselves. > } > - > + nfs_readdir_page_init_array(page, last_cookie, change_attr); > + SetPageUptodate(page); > return page; > } > > @@ -356,12 +383,6 @@ static void nfs_readdir_page_set_eof(struct page > *page) > kunmap_atomic(array); > } > > -static void nfs_readdir_page_unlock_and_put(struct page *page) > -{ > - unlock_page(page); > - put_page(page); > -} > - > static struct page *nfs_readdir_page_get_next(struct address_space > *mapping, > pgoff_t index, u64 cookie) > { > @@ -418,16 +439,6 @@ static int nfs_readdir_search_for_pos(struct > nfs_cache_array *array, > return -EBADCOOKIE; > } > > -static bool > -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) > -{ > - if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE | > - NFS_INO_INVALID_DATA)) > - return false; > - smp_rmb(); > - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); > -} > - > static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array > *array, > u64 cookie) > { > @@ -456,8 +467,7 @@ static int nfs_readdir_search_for_cookie(struct > nfs_cache_array *array, > struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); > > new_pos = nfs_readdir_page_offset(desc->page) + i; > - if (desc->attr_gencount != nfsi->attr_gencount || > - !nfs_readdir_inode_mapping_valid(nfsi)) { > + if (desc->attr_gencount != nfsi->attr_gencount) { > desc->duped = 0; > desc->attr_gencount = nfsi->attr_gencount; > } else if (new_pos < desc->prev_index) { > @@ -1094,11 +1104,7 @@ static int nfs_readdir(struct file *file, > struct dir_context *ctx) > * to either find the entry with the appropriate number or > * revalidate the cookie. > */ > - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { > - res = nfs_revalidate_mapping(inode, file->f_mapping); > - if (res < 0) > - goto out; > - } > + nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); Same as above -> why not send GETATTR with READDIR instead of doing it in a separate RPC? Ben
On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: > On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > Use the change attribute and the first cookie in a directory page > > cache > > entry to validate that the page is up to date. > > > > Suggested-by: Benjamin Coddington <bcodding@redhat.com> > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > --- > > fs/nfs/dir.c | 68 > > ++++++++++++++++++++++++++++------------------------ > > 1 file changed, 37 insertions(+), 31 deletions(-) > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c > > index f2258e926df2..5d9367d9b651 100644 > > --- a/fs/nfs/dir.c > > +++ b/fs/nfs/dir.c > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { > > }; > > > > struct nfs_cache_array { > > + u64 change_attr; > > u64 last_cookie; > > unsigned int size; > > unsigned char page_full : 1, > > @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct > > nfs_cache_array *array) > > memset(array, 0, sizeof(struct nfs_cache_array)); > > } > > > > -static void nfs_readdir_page_init_array(struct page *page, u64 > > last_cookie) > > +static void nfs_readdir_page_init_array(struct page *page, u64 > > last_cookie, > > + u64 change_attr) > > { > > struct nfs_cache_array *array; > > > There's a hunk missing here, something like: > > @@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct > page > *page, u64 last_cookie, > nfs_readdir_array_init(array); > array->last_cookie = last_cookie; > array->cookies_are_ordered = 1; > + array->change_attr = change_attr; > kunmap_atomic(array); > } > > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie, > > gfp_t gfp_flags) > > { > > struct page *page = alloc_page(gfp_flags); > > if (page) > > - nfs_readdir_page_init_array(page, last_cookie); > > + nfs_readdir_page_init_array(page, last_cookie, 0); > > return page; > > } > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct nfs_entry > > *entry, struct page *page) > > return ret; > > } > > > > +static bool nfs_readdir_page_cookie_match(struct page *page, u64 > > last_cookie, > > + u64 change_attr) > > How about "nfs_readdir_page_valid()"? There's more going on than a > cookie match. > > > > +{ > > + struct nfs_cache_array *array = kmap_atomic(page); > > + int ret = true; > > + > > + if (array->change_attr != change_attr) > > + ret = false; > > Can we skip the next test if ret = false? I'd expect the compiler to do that. > > > + if (array->size > 0 && array->array[0].cookie != > > last_cookie) > > + ret = false; > > + kunmap_atomic(array); > > + return ret; > > +} > > + > > +static void nfs_readdir_page_unlock_and_put(struct page *page) > > +{ > > + unlock_page(page); > > + put_page(page); > > +} > > + > > static struct page *nfs_readdir_page_get_locked(struct > > address_space > > *mapping, > > pgoff_t index, u64 > > last_cookie) > > { > > struct page *page; > > + u64 change_attr; > > > > page = grab_cache_page(mapping, index); > > - if (page && !PageUptodate(page)) { > > - nfs_readdir_page_init_array(page, last_cookie); > > - if (invalidate_inode_pages2_range(mapping, index + > > 1, -1) < 0) > > - nfs_zap_mapping(mapping->host, mapping); > > - SetPageUptodate(page); > > + if (!page) > > + return NULL; > > + change_attr = inode_peek_iversion_raw(mapping->host); > > + if (PageUptodate(page)) { > > + if (nfs_readdir_page_cookie_match(page, > > last_cookie, > > + change_attr)) > > + return page; > > + nfs_readdir_clear_array(page); > > > Why use i_version rather than nfs_save_change_attribute? Seems > having a > consistent value across the pachecache and dir_verifiers would help > debugging, and we've already have a bunch of machinery around the > change_attribute. The directory cache_change_attribute is not reported in tracepoints because it is a directory-specific field, so it's not as useful for debugging. The inode change attribute is what we have traditionally used for determining cache consistency, and when to invalidate the cache. > > Don't we need to send a GETATTR with READDIR for v4? Not doing so > means > that the pagecache is going to behave differently for v3 and v4, and > we'll > potentially end up with totally bogus listings for cases where one > reader > has cached a page of entries in the middle of the pagecache marked > with > i_version A, but entries are actually from i_version A++ on the > server. > Then another reader comes along and follows earlier entries from > i_version A > on the server that lead into entries from A++. I don't think we can > detect > this case unless we're checking the directory on every READDIR. The value of the change attribute is determined when the page is allocated. That value is unaffected by the READDIR call. That works with NFSv2 as well as NFSv3+v4. > > Sending a GETATTR for v4 doesn't eliminate that race on the server > side, > but > does remove the large window on the client created by the attribute > cache > timeouts, and I think its mostly harmless performance-wise. > > Also, we don't need the local change_attr variable just to pass it to > other > functions that can access it themselves. > > > } > > - > > + nfs_readdir_page_init_array(page, last_cookie, > > change_attr); > > + SetPageUptodate(page); > > return page; > > } > > > > @@ -356,12 +383,6 @@ static void nfs_readdir_page_set_eof(struct > > page > > *page) > > kunmap_atomic(array); > > } > > > > -static void nfs_readdir_page_unlock_and_put(struct page *page) > > -{ > > - unlock_page(page); > > - put_page(page); > > -} > > - > > static struct page *nfs_readdir_page_get_next(struct address_space > > *mapping, > > pgoff_t index, u64 > > cookie) > > { > > @@ -418,16 +439,6 @@ static int nfs_readdir_search_for_pos(struct > > nfs_cache_array *array, > > return -EBADCOOKIE; > > } > > > > -static bool > > -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) > > -{ > > - if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE | > > - NFS_INO_INVALID_DATA)) > > - return false; > > - smp_rmb(); > > - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); > > -} > > - > > static bool nfs_readdir_array_cookie_in_range(struct > > nfs_cache_array > > *array, > > u64 cookie) > > { > > @@ -456,8 +467,7 @@ static int nfs_readdir_search_for_cookie(struct > > nfs_cache_array *array, > > struct nfs_inode *nfsi = > > NFS_I(file_inode(desc->file)); > > > > new_pos = nfs_readdir_page_offset(desc- > > >page) + i; > > - if (desc->attr_gencount != nfsi- > > >attr_gencount || > > - !nfs_readdir_inode_mapping_valid(nfsi)) > > { > > + if (desc->attr_gencount != nfsi- > > >attr_gencount) { > > desc->duped = 0; > > desc->attr_gencount = nfsi- > > >attr_gencount; > > } else if (new_pos < desc->prev_index) { > > @@ -1094,11 +1104,7 @@ static int nfs_readdir(struct file *file, > > struct dir_context *ctx) > > * to either find the entry with the appropriate number or > > * revalidate the cookie. > > */ > > - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { > > - res = nfs_revalidate_mapping(inode, file- > > >f_mapping); > > - if (res < 0) > > - goto out; > > - } > > + nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); > > Same as above -> why not send GETATTR with READDIR instead of doing > it > in a > separate RPC?
On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote: > On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: > > On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > Use the change attribute and the first cookie in a directory page > > > cache > > > entry to validate that the page is up to date. > > > > > > Suggested-by: Benjamin Coddington <bcodding@redhat.com> > > > Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> > > > --- > > > fs/nfs/dir.c | 68 > > > ++++++++++++++++++++++++++++------------------------ > > > 1 file changed, 37 insertions(+), 31 deletions(-) > > > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c > > > index f2258e926df2..5d9367d9b651 100644 > > > --- a/fs/nfs/dir.c > > > +++ b/fs/nfs/dir.c > > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { > > > }; > > > > > > struct nfs_cache_array { > > > + u64 change_attr; > > > u64 last_cookie; > > > unsigned int size; > > > unsigned char page_full : 1, > > > @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct > > > nfs_cache_array *array) > > > memset(array, 0, sizeof(struct nfs_cache_array)); > > > } > > > > > > -static void nfs_readdir_page_init_array(struct page *page, u64 > > > last_cookie) > > > +static void nfs_readdir_page_init_array(struct page *page, u64 > > > last_cookie, > > > + u64 change_attr) > > > { > > > struct nfs_cache_array *array; > > > > > > There's a hunk missing here, something like: > > > > @@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct > > page > > *page, u64 last_cookie, > > nfs_readdir_array_init(array); > > array->last_cookie = last_cookie; > > array->cookies_are_ordered = 1; > > + array->change_attr = change_attr; > > kunmap_atomic(array); > > } > > > > > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie, > > > gfp_t gfp_flags) > > > { > > > struct page *page = alloc_page(gfp_flags); > > > if (page) > > > - nfs_readdir_page_init_array(page, last_cookie); > > > + nfs_readdir_page_init_array(page, last_cookie, > > > 0); > > > return page; > > > } > > > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct > > > nfs_entry > > > *entry, struct page *page) > > > return ret; > > > } > > > > > > +static bool nfs_readdir_page_cookie_match(struct page *page, u64 > > > last_cookie, > > > + u64 change_attr) > > > > How about "nfs_readdir_page_valid()"? There's more going on than a > > cookie match. > > > > > > > +{ > > > + struct nfs_cache_array *array = kmap_atomic(page); > > > + int ret = true; > > > + > > > + if (array->change_attr != change_attr) > > > + ret = false; > > > > Can we skip the next test if ret = false? > > I'd expect the compiler to do that. > > > > > > + if (array->size > 0 && array->array[0].cookie != > > > last_cookie) > > > + ret = false; > > > + kunmap_atomic(array); > > > + return ret; > > > +} > > > + > > > +static void nfs_readdir_page_unlock_and_put(struct page *page) > > > +{ > > > + unlock_page(page); > > > + put_page(page); > > > +} > > > + > > > static struct page *nfs_readdir_page_get_locked(struct > > > address_space > > > *mapping, > > > pgoff_t index, > > > u64 > > > last_cookie) > > > { > > > struct page *page; > > > + u64 change_attr; > > > > > > page = grab_cache_page(mapping, index); > > > - if (page && !PageUptodate(page)) { > > > - nfs_readdir_page_init_array(page, last_cookie); > > > - if (invalidate_inode_pages2_range(mapping, index > > > + > > > 1, -1) < 0) > > > - nfs_zap_mapping(mapping->host, mapping); > > > - SetPageUptodate(page); > > > + if (!page) > > > + return NULL; > > > + change_attr = inode_peek_iversion_raw(mapping->host); > > > + if (PageUptodate(page)) { > > > + if (nfs_readdir_page_cookie_match(page, > > > last_cookie, > > > + change_attr)) > > > + return page; > > > + nfs_readdir_clear_array(page); > > > > > > Why use i_version rather than nfs_save_change_attribute? Seems > > having a > > consistent value across the pachecache and dir_verifiers would help > > debugging, and we've already have a bunch of machinery around the > > change_attribute. > > The directory cache_change_attribute is not reported in tracepoints > because it is a directory-specific field, so it's not as useful for > debugging. > > The inode change attribute is what we have traditionally used for > determining cache consistency, and when to invalidate the cache. I should probably elaborate a little further on the differences between the inode change attribute and the cache_change_attribute. One of the main reasons for introducing the latter was to have something that allows us to track changes to the directory, but to avoid forcing unnecessary revalidations of the dcache. What this means is that when we create or remove a file, and the pre/post-op attributes tell us that there were no third party changes to the directory, we update the dcache, but we do _not_ update the cache_change_attribute, because we know that the rest of the directory contents are valid, and so we don't have to revalidate the dentries. However in that case, we _do_ want to update the readdir cache to reflect the fact that an entry was added or deleted. While we could figure out how to remove an entry (at least for the case where the filesystem is case-sensitive), we do not know where the filesystem added the new file, or what cookies was assigned. This is why the inode change attribute is more appropriate for indexing the page cache pages. It reflects the cases where we want to revalidate the readdir cache, as opposed to the dcache.
On 24 Feb 2022, at 22:51, Trond Myklebust wrote: > On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote: >> On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: >>> On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: >>> >>>> From: Trond Myklebust <trond.myklebust@hammerspace.com> >>>> >>>> Use the change attribute and the first cookie in a directory page >>>> cache >>>> entry to validate that the page is up to date. >>>> >>>> Suggested-by: Benjamin Coddington <bcodding@redhat.com> >>>> Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com> >>>> --- >>>> fs/nfs/dir.c | 68 >>>> ++++++++++++++++++++++++++++------------------------ >>>> 1 file changed, 37 insertions(+), 31 deletions(-) >>>> >>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c >>>> index f2258e926df2..5d9367d9b651 100644 >>>> --- a/fs/nfs/dir.c >>>> +++ b/fs/nfs/dir.c >>>> @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { >>>> }; >>>> >>>> struct nfs_cache_array { >>>> + u64 change_attr; >>>> u64 last_cookie; >>>> unsigned int size; >>>> unsigned char page_full : 1, >>>> @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct >>>> nfs_cache_array *array) >>>> memset(array, 0, sizeof(struct nfs_cache_array)); >>>> } >>>> >>>> -static void nfs_readdir_page_init_array(struct page *page, u64 >>>> last_cookie) >>>> +static void nfs_readdir_page_init_array(struct page *page, u64 >>>> last_cookie, >>>> + u64 >>>> change_attr) >>>> { >>>> struct nfs_cache_array *array; >>> >>> >>> There's a hunk missing here, something like: >>> >>> @@ -185,6 +185,7 @@ static void nfs_readdir_page_init_array(struct >>> page >>> *page, u64 last_cookie, >>> nfs_readdir_array_init(array); >>> array->last_cookie = last_cookie; >>> array->cookies_are_ordered = 1; >>> + array->change_attr = change_attr; >>> kunmap_atomic(array); >>> } >>> >>>> >>>> @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie, >>>> gfp_t gfp_flags) >>>> { >>>> struct page *page = alloc_page(gfp_flags); >>>> if (page) >>>> - nfs_readdir_page_init_array(page, >>>> last_cookie); >>>> + nfs_readdir_page_init_array(page, >>>> last_cookie, >>>> 0); >>>> return page; >>>> } >>>> >>>> @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct >>>> nfs_entry >>>> *entry, struct page *page) >>>> return ret; >>>> } >>>> >>>> +static bool nfs_readdir_page_cookie_match(struct page *page, u64 >>>> last_cookie, >>>> + >>>> u64 change_attr) >>> >>> How about "nfs_readdir_page_valid()"? There's more going on than a >>> cookie match. >>> >>> >>>> +{ >>>> + struct nfs_cache_array *array = kmap_atomic(page); >>>> + int ret = true; >>>> + >>>> + if (array->change_attr != change_attr) >>>> + ret = false; >>> >>> Can we skip the next test if ret = false? >> >> I'd expect the compiler to do that. >> >>> >>>> + if (array->size > 0 && array->array[0].cookie != >>>> last_cookie) >>>> + ret = false; >>>> + kunmap_atomic(array); >>>> + return ret; >>>> +} >>>> + >>>> +static void nfs_readdir_page_unlock_and_put(struct page *page) >>>> +{ >>>> + unlock_page(page); >>>> + put_page(page); >>>> +} >>>> + >>>> static struct page *nfs_readdir_page_get_locked(struct >>>> address_space >>>> *mapping, >>>> pgoff_t >>>> index, >>>> u64 >>>> last_cookie) >>>> { >>>> struct page *page; >>>> + u64 change_attr; >>>> >>>> page = grab_cache_page(mapping, index); >>>> - if (page && !PageUptodate(page)) { >>>> - nfs_readdir_page_init_array(page, >>>> last_cookie); >>>> - if >>>> (invalidate_inode_pages2_range(mapping, index >>>> + >>>> 1, -1) < 0) >>>> - nfs_zap_mapping(mapping->host, >>>> mapping); >>>> - SetPageUptodate(page); >>>> + if (!page) >>>> + return NULL; >>>> + change_attr = >>>> inode_peek_iversion_raw(mapping->host); >>>> + if (PageUptodate(page)) { >>>> + if >>>> (nfs_readdir_page_cookie_match(page, >>>> last_cookie, >>>> + >>>> change_attr)) >>>> + return page; >>>> + nfs_readdir_clear_array(page); >>> >>> >>> Why use i_version rather than nfs_save_change_attribute? Seems >>> having a >>> consistent value across the pachecache and dir_verifiers would help >>> debugging, and we've already have a bunch of machinery around the >>> change_attribute. >> >> The directory cache_change_attribute is not reported in tracepoints >> because it is a directory-specific field, so it's not as useful for >> debugging. >> >> The inode change attribute is what we have traditionally used for >> determining cache consistency, and when to invalidate the cache. > > I should probably elaborate a little further on the differences > between > the inode change attribute and the cache_change_attribute. > > One of the main reasons for introducing the latter was to have > something that allows us to track changes to the directory, but to > avoid forcing unnecessary revalidations of the dcache. > > What this means is that when we create or remove a file, and the > pre/post-op attributes tell us that there were no third party changes > to the directory, we update the dcache, but we do _not_ update the > cache_change_attribute, because we know that the rest of the directory > contents are valid, and so we don't have to revalidate the dentries. > However in that case, we _do_ want to update the readdir cache to > reflect the fact that an entry was added or deleted. While we could > figure out how to remove an entry (at least for the case where the > filesystem is case-sensitive), we do not know where the filesystem > added the new file, or what cookies was assigned. > > This is why the inode change attribute is more appropriate for > indexing > the page cache pages. It reflects the cases where we want to > revalidate > the readdir cache, as opposed to the dcache. Ok, thanks for explaining this. I've noticed that you haven't responded about my concerns about not checking the directory for changes with every v4 READDIR. For v3, we have post-op updates to the directory, but with v4 the directory can change and we'll end up with entries in the cache that are marked with an old change_attr. I'm pretty positive that not checking for changes to the directory (not sending GETATTR with READDIR) is going to create cases of double-listed and truncated-listings for dirctory listers. Not handling those cases means I'm going to have some very unhappy customers that complain about their files disappearing/reappearing on NFS. If you need me to prove that its an issue, I can take the time to write up program that shows this problem. Ben
On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote: > On 24 Feb 2022, at 22:51, Trond Myklebust wrote: > > > On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote: > > > On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: > > > > On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: > > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > > > Use the change attribute and the first cookie in a directory > > > > > page > > > > > cache > > > > > entry to validate that the page is up to date. > > > > > > > > > > Suggested-by: Benjamin Coddington <bcodding@redhat.com> > > > > > Signed-off-by: Trond Myklebust > > > > > <trond.myklebust@hammerspace.com> > > > > > --- > > > > > fs/nfs/dir.c | 68 > > > > > ++++++++++++++++++++++++++++------------------------ > > > > > 1 file changed, 37 insertions(+), 31 deletions(-) > > > > > > > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c > > > > > index f2258e926df2..5d9367d9b651 100644 > > > > > --- a/fs/nfs/dir.c > > > > > +++ b/fs/nfs/dir.c > > > > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { > > > > > }; > > > > > > > > > > struct nfs_cache_array { > > > > > + u64 change_attr; > > > > > u64 last_cookie; > > > > > unsigned int size; > > > > > unsigned char page_full : 1, > > > > > @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct > > > > > nfs_cache_array *array) > > > > > memset(array, 0, sizeof(struct nfs_cache_array)); > > > > > } > > > > > > > > > > -static void nfs_readdir_page_init_array(struct page *page, > > > > > u64 > > > > > last_cookie) > > > > > +static void nfs_readdir_page_init_array(struct page *page, > > > > > u64 > > > > > last_cookie, > > > > > + u64 > > > > > change_attr) > > > > > { > > > > > struct nfs_cache_array *array; > > > > > > > > > > > > There's a hunk missing here, something like: > > > > > > > > @@ -185,6 +185,7 @@ static void > > > > nfs_readdir_page_init_array(struct > > > > page > > > > *page, u64 last_cookie, > > > > nfs_readdir_array_init(array); > > > > array->last_cookie = last_cookie; > > > > array->cookies_are_ordered = 1; > > > > + array->change_attr = change_attr; > > > > kunmap_atomic(array); > > > > } > > > > > > > > > > > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 > > > > > last_cookie, > > > > > gfp_t gfp_flags) > > > > > { > > > > > struct page *page = alloc_page(gfp_flags); > > > > > if (page) > > > > > - nfs_readdir_page_init_array(page, > > > > > last_cookie); > > > > > + nfs_readdir_page_init_array(page, > > > > > last_cookie, > > > > > 0); > > > > > return page; > > > > > } > > > > > > > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct > > > > > nfs_entry > > > > > *entry, struct page *page) > > > > > return ret; > > > > > } > > > > > > > > > > +static bool nfs_readdir_page_cookie_match(struct page *page, > > > > > u64 > > > > > last_cookie, > > > > > + > > > > > u64 change_attr) > > > > > > > > How about "nfs_readdir_page_valid()"? There's more going on > > > > than a > > > > cookie match. > > > > > > > > > > > > > +{ > > > > > + struct nfs_cache_array *array = kmap_atomic(page); > > > > > + int ret = true; > > > > > + > > > > > + if (array->change_attr != change_attr) > > > > > + ret = false; > > > > > > > > Can we skip the next test if ret = false? > > > > > > I'd expect the compiler to do that. > > > > > > > > > > > > + if (array->size > 0 && array->array[0].cookie != > > > > > last_cookie) > > > > > + ret = false; > > > > > + kunmap_atomic(array); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +static void nfs_readdir_page_unlock_and_put(struct page > > > > > *page) > > > > > +{ > > > > > + unlock_page(page); > > > > > + put_page(page); > > > > > +} > > > > > + > > > > > static struct page *nfs_readdir_page_get_locked(struct > > > > > address_space > > > > > *mapping, > > > > > pgoff_t > > > > > index, > > > > > u64 > > > > > last_cookie) > > > > > { > > > > > struct page *page; > > > > > + u64 change_attr; > > > > > > > > > > page = grab_cache_page(mapping, index); > > > > > - if (page && !PageUptodate(page)) { > > > > > - nfs_readdir_page_init_array(page, > > > > > last_cookie); > > > > > - if > > > > > (invalidate_inode_pages2_range(mapping, index > > > > > + > > > > > 1, -1) < 0) > > > > > - nfs_zap_mapping(mapping->host, > > > > > mapping); > > > > > - SetPageUptodate(page); > > > > > + if (!page) > > > > > + return NULL; > > > > > + change_attr = > > > > > inode_peek_iversion_raw(mapping->host); > > > > > + if (PageUptodate(page)) { > > > > > + if > > > > > (nfs_readdir_page_cookie_match(page, > > > > > last_cookie, > > > > > + > > > > > change_attr)) > > > > > + return page; > > > > > + nfs_readdir_clear_array(page); > > > > > > > > > > > > Why use i_version rather than nfs_save_change_attribute? Seems > > > > having a > > > > consistent value across the pachecache and dir_verifiers would > > > > help > > > > debugging, and we've already have a bunch of machinery around > > > > the > > > > change_attribute. > > > > > > The directory cache_change_attribute is not reported in > > > tracepoints > > > because it is a directory-specific field, so it's not as useful > > > for > > > debugging. > > > > > > The inode change attribute is what we have traditionally used for > > > determining cache consistency, and when to invalidate the cache. > > > > I should probably elaborate a little further on the differences > > between > > the inode change attribute and the cache_change_attribute. > > > > One of the main reasons for introducing the latter was to have > > something that allows us to track changes to the directory, but to > > avoid forcing unnecessary revalidations of the dcache. > > > > What this means is that when we create or remove a file, and the > > pre/post-op attributes tell us that there were no third party > > changes > > to the directory, we update the dcache, but we do _not_ update the > > cache_change_attribute, because we know that the rest of the > > directory > > contents are valid, and so we don't have to revalidate the > > dentries. > > However in that case, we _do_ want to update the readdir cache to > > reflect the fact that an entry was added or deleted. While we could > > figure out how to remove an entry (at least for the case where the > > filesystem is case-sensitive), we do not know where the filesystem > > added the new file, or what cookies was assigned. > > > > This is why the inode change attribute is more appropriate for > > indexing > > the page cache pages. It reflects the cases where we want to > > revalidate > > the readdir cache, as opposed to the dcache. > > Ok, thanks for explaining this. > > I've noticed that you haven't responded about my concerns about not > checking > the directory for changes with every v4 READDIR. For v3, we have > post-op > updates to the directory, but with v4 the directory can change and > we'll > end up with entries in the cache that are marked with an old > change_attr. > Then they will be rejected by nfs_readdir_page_cookie_match() if a user looks up that page again after we've revalidated the change attribute on the directory. ...and note that NFSv4 does returns a struct change_info4 for all operations that change the directory, so we will update the change attribute in all those cases. If the change is made on the server, well then we will detect it through the standard revalidation process that usually decides when to invalidate the directory page cache. > I'm pretty positive that not checking for changes to the directory > (not > sending GETATTR with READDIR) is going to create cases of double- > listed > and > truncated-listings for dirctory listers. Not handling those cases > means > I'm > going to have some very unhappy customers that complain about their > files > disappearing/reappearing on NFS. > > If you need me to prove that its an issue, I can take the time to > write > up > program that shows this problem. > If you label the page contents with an attribute that was retrieved _after_ the READDIR op, then you will introduce this as a problem for your customers. The reason is that there is no atomicity between operations in a COMPOUND. Worse, the implementation of readdir in scalable modern systems, including Linux, does not even guarantee atomicity of the readdir operation itself. Instead each readdir entry is filled without holding any locks or preventing any changes to the directory or to the object itself. POSIX states very explicitly that if you're making changes to the directory after the call to opendir() or rewinddir(), then the behaviour w.r.t. whether that file appears in the readdir() call is unspecified. See https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html This is also consistent with how glibc caches the results of a getdents() call.
On Fri, 2022-02-25 at 13:10 +0000, Trond Myklebust wrote: > On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote: > > On 24 Feb 2022, at 22:51, Trond Myklebust wrote: > > > > > On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote: > > > > On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: > > > > > On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: > > > > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > > > > > Use the change attribute and the first cookie in a > > > > > > directory > > > > > > page > > > > > > cache > > > > > > entry to validate that the page is up to date. > > > > > > > > > > > > Suggested-by: Benjamin Coddington <bcodding@redhat.com> > > > > > > Signed-off-by: Trond Myklebust > > > > > > <trond.myklebust@hammerspace.com> > > > > > > --- > > > > > > fs/nfs/dir.c | 68 > > > > > > ++++++++++++++++++++++++++++------------------------ > > > > > > 1 file changed, 37 insertions(+), 31 deletions(-) > > > > > > > > > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c > > > > > > index f2258e926df2..5d9367d9b651 100644 > > > > > > --- a/fs/nfs/dir.c > > > > > > +++ b/fs/nfs/dir.c > > > > > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { > > > > > > }; > > > > > > > > > > > > struct nfs_cache_array { > > > > > > + u64 change_attr; > > > > > > u64 last_cookie; > > > > > > unsigned int size; > > > > > > unsigned char page_full : 1, > > > > > > @@ -175,7 +176,8 @@ static void > > > > > > nfs_readdir_array_init(struct > > > > > > nfs_cache_array *array) > > > > > > memset(array, 0, sizeof(struct nfs_cache_array)); > > > > > > } > > > > > > > > > > > > -static void nfs_readdir_page_init_array(struct page *page, > > > > > > u64 > > > > > > last_cookie) > > > > > > +static void nfs_readdir_page_init_array(struct page *page, > > > > > > u64 > > > > > > last_cookie, > > > > > > + u64 > > > > > > change_attr) > > > > > > { > > > > > > struct nfs_cache_array *array; > > > > > > > > > > > > > > > There's a hunk missing here, something like: > > > > > > > > > > @@ -185,6 +185,7 @@ static void > > > > > nfs_readdir_page_init_array(struct > > > > > page > > > > > *page, u64 last_cookie, > > > > > nfs_readdir_array_init(array); > > > > > array->last_cookie = last_cookie; > > > > > array->cookies_are_ordered = 1; > > > > > + array->change_attr = change_attr; > > > > > kunmap_atomic(array); > > > > > } > > > > > > > > > > > > > > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 > > > > > > last_cookie, > > > > > > gfp_t gfp_flags) > > > > > > { > > > > > > struct page *page = alloc_page(gfp_flags); > > > > > > if (page) > > > > > > - nfs_readdir_page_init_array(page, > > > > > > last_cookie); > > > > > > + nfs_readdir_page_init_array(page, > > > > > > last_cookie, > > > > > > 0); > > > > > > return page; > > > > > > } > > > > > > > > > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct > > > > > > nfs_entry > > > > > > *entry, struct page *page) > > > > > > return ret; > > > > > > } > > > > > > > > > > > > +static bool nfs_readdir_page_cookie_match(struct page > > > > > > *page, > > > > > > u64 > > > > > > last_cookie, > > > > > > + > > > > > > u64 change_attr) > > > > > > > > > > How about "nfs_readdir_page_valid()"? There's more going on > > > > > than a > > > > > cookie match. > > > > > > > > > > > > > > > > +{ > > > > > > + struct nfs_cache_array *array = kmap_atomic(page); > > > > > > + int ret = true; > > > > > > + > > > > > > + if (array->change_attr != change_attr) > > > > > > + ret = false; > > > > > > > > > > Can we skip the next test if ret = false? > > > > > > > > I'd expect the compiler to do that. > > > > > > > > > > > > > > > + if (array->size > 0 && array->array[0].cookie != > > > > > > last_cookie) > > > > > > + ret = false; > > > > > > + kunmap_atomic(array); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +static void nfs_readdir_page_unlock_and_put(struct page > > > > > > *page) > > > > > > +{ > > > > > > + unlock_page(page); > > > > > > + put_page(page); > > > > > > +} > > > > > > + > > > > > > static struct page *nfs_readdir_page_get_locked(struct > > > > > > address_space > > > > > > *mapping, > > > > > > pgoff_t > > > > > > index, > > > > > > u64 > > > > > > last_cookie) > > > > > > { > > > > > > struct page *page; > > > > > > + u64 change_attr; > > > > > > > > > > > > page = grab_cache_page(mapping, index); > > > > > > - if (page && !PageUptodate(page)) { > > > > > > - nfs_readdir_page_init_array(page, > > > > > > last_cookie); > > > > > > - if > > > > > > (invalidate_inode_pages2_range(mapping, index > > > > > > + > > > > > > 1, -1) < 0) > > > > > > - nfs_zap_mapping(mapping->host, > > > > > > mapping); > > > > > > - SetPageUptodate(page); > > > > > > + if (!page) > > > > > > + return NULL; > > > > > > + change_attr = > > > > > > inode_peek_iversion_raw(mapping->host); > > > > > > + if (PageUptodate(page)) { > > > > > > + if > > > > > > (nfs_readdir_page_cookie_match(page, > > > > > > last_cookie, > > > > > > + > > > > > > change_attr)) > > > > > > + return page; > > > > > > + nfs_readdir_clear_array(page); > > > > > > > > > > > > > > > Why use i_version rather than nfs_save_change_attribute? > > > > > Seems > > > > > having a > > > > > consistent value across the pachecache and dir_verifiers > > > > > would > > > > > help > > > > > debugging, and we've already have a bunch of machinery around > > > > > the > > > > > change_attribute. > > > > > > > > The directory cache_change_attribute is not reported in > > > > tracepoints > > > > because it is a directory-specific field, so it's not as useful > > > > for > > > > debugging. > > > > > > > > The inode change attribute is what we have traditionally used > > > > for > > > > determining cache consistency, and when to invalidate the > > > > cache. > > > > > > I should probably elaborate a little further on the differences > > > between > > > the inode change attribute and the cache_change_attribute. > > > > > > One of the main reasons for introducing the latter was to have > > > something that allows us to track changes to the directory, but > > > to > > > avoid forcing unnecessary revalidations of the dcache. > > > > > > What this means is that when we create or remove a file, and the > > > pre/post-op attributes tell us that there were no third party > > > changes > > > to the directory, we update the dcache, but we do _not_ update > > > the > > > cache_change_attribute, because we know that the rest of the > > > directory > > > contents are valid, and so we don't have to revalidate the > > > dentries. > > > However in that case, we _do_ want to update the readdir cache to > > > reflect the fact that an entry was added or deleted. While we > > > could > > > figure out how to remove an entry (at least for the case where > > > the > > > filesystem is case-sensitive), we do not know where the > > > filesystem > > > added the new file, or what cookies was assigned. > > > > > > This is why the inode change attribute is more appropriate for > > > indexing > > > the page cache pages. It reflects the cases where we want to > > > revalidate > > > the readdir cache, as opposed to the dcache. > > > > Ok, thanks for explaining this. > > > > I've noticed that you haven't responded about my concerns about not > > checking > > the directory for changes with every v4 READDIR. For v3, we have > > post-op > > updates to the directory, but with v4 the directory can change and > > we'll > > end up with entries in the cache that are marked with an old > > change_attr. > > > > Then they will be rejected by nfs_readdir_page_cookie_match() if a > user > looks up that page again after we've revalidated the change attribute > on the directory. > > ...and note that NFSv4 does returns a struct change_info4 for all > operations that change the directory, so we will update the change > attribute in all those cases. > > If the change is made on the server, well then we will detect it > through the standard revalidation process that usually decides when > to > invalidate the directory page cache. > > > I'm pretty positive that not checking for changes to the directory > > (not > > sending GETATTR with READDIR) is going to create cases of double- > > listed > > and > > truncated-listings for dirctory listers. Not handling those cases > > means > > I'm > > going to have some very unhappy customers that complain about their > > files > > disappearing/reappearing on NFS. > > > > If you need me to prove that its an issue, I can take the time to > > write > > up > > program that shows this problem. > > > > If you label the page contents with an attribute that was retrieved > _after_ the READDIR op, then you will introduce this as a problem for > your customers. > > The reason is that there is no atomicity between operations in a > COMPOUND. Worse, the implementation of readdir in scalable modern > systems, including Linux, does not even guarantee atomicity of the > readdir operation itself. Instead each readdir entry is filled > without > holding any locks or preventing any changes to the directory or to > the > object itself. > > POSIX states very explicitly that if you're making changes to the > directory after the call to opendir() or rewinddir(), then the > behaviour w.r.t. whether that file appears in the readdir() call is > unspecified. See > https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html > > This is also consistent with how glibc caches the results of a > getdents() call. > Ah, wait a minute... There is a problem with the call to nfs_readdir_page_get_next(). It will allocate the page _after_ the readdir call itself, and so might label it with a newer change attribute... I'll fix that so we can pass in the change attribute associated with the readdir call.
On 25 Feb 2022, at 8:10, Trond Myklebust wrote: > On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote: >> On 24 Feb 2022, at 22:51, Trond Myklebust wrote: >> >>> On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote: >>>> On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: >>>>> On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: >>>>> >>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com> >>>>>> >>>>>> Use the change attribute and the first cookie in a directory >>>>>> page >>>>>> cache >>>>>> entry to validate that the page is up to date. >>>>>> >>>>>> Suggested-by: Benjamin Coddington <bcodding@redhat.com> >>>>>> Signed-off-by: Trond Myklebust >>>>>> <trond.myklebust@hammerspace.com> >>>>>> --- >>>>>> fs/nfs/dir.c | 68 >>>>>> ++++++++++++++++++++++++++++------------------------ >>>>>> 1 file changed, 37 insertions(+), 31 deletions(-) >>>>>> >>>>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c >>>>>> index f2258e926df2..5d9367d9b651 100644 >>>>>> --- a/fs/nfs/dir.c >>>>>> +++ b/fs/nfs/dir.c >>>>>> @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { >>>>>> }; >>>>>> >>>>>> struct nfs_cache_array { >>>>>> + u64 change_attr; >>>>>> u64 last_cookie; >>>>>> unsigned int size; >>>>>> unsigned char page_full : 1, >>>>>> @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct >>>>>> nfs_cache_array *array) >>>>>> memset(array, 0, sizeof(struct nfs_cache_array)); >>>>>> } >>>>>> >>>>>> -static void nfs_readdir_page_init_array(struct page *page, >>>>>> u64 >>>>>> last_cookie) >>>>>> +static void nfs_readdir_page_init_array(struct page *page, >>>>>> u64 >>>>>> last_cookie, >>>>>> + u64 >>>>>> change_attr) >>>>>> { >>>>>> struct nfs_cache_array *array; >>>>> >>>>> >>>>> There's a hunk missing here, something like: >>>>> >>>>> @@ -185,6 +185,7 @@ static void >>>>> nfs_readdir_page_init_array(struct >>>>> page >>>>> *page, u64 last_cookie, >>>>> nfs_readdir_array_init(array); >>>>> array->last_cookie = last_cookie; >>>>> array->cookies_are_ordered = 1; >>>>> + array->change_attr = change_attr; >>>>> kunmap_atomic(array); >>>>> } >>>>> >>>>>> >>>>>> @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 >>>>>> last_cookie, >>>>>> gfp_t gfp_flags) >>>>>> { >>>>>> struct page *page = alloc_page(gfp_flags); >>>>>> if (page) >>>>>> - nfs_readdir_page_init_array(page, >>>>>> last_cookie); >>>>>> + nfs_readdir_page_init_array(page, >>>>>> last_cookie, >>>>>> 0); >>>>>> return page; >>>>>> } >>>>>> >>>>>> @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct >>>>>> nfs_entry >>>>>> *entry, struct page *page) >>>>>> return ret; >>>>>> } >>>>>> >>>>>> +static bool nfs_readdir_page_cookie_match(struct page *page, >>>>>> u64 >>>>>> last_cookie, >>>>>> + >>>>>> u64 change_attr) >>>>> >>>>> How about "nfs_readdir_page_valid()"? There's more going on >>>>> than a >>>>> cookie match. >>>>> >>>>> >>>>>> +{ >>>>>> + struct nfs_cache_array *array = kmap_atomic(page); >>>>>> + int ret = true; >>>>>> + >>>>>> + if (array->change_attr != change_attr) >>>>>> + ret = false; >>>>> >>>>> Can we skip the next test if ret = false? >>>> >>>> I'd expect the compiler to do that. >>>> >>>>> >>>>>> + if (array->size > 0 && array->array[0].cookie != >>>>>> last_cookie) >>>>>> + ret = false; >>>>>> + kunmap_atomic(array); >>>>>> + return ret; >>>>>> +} >>>>>> + >>>>>> +static void nfs_readdir_page_unlock_and_put(struct page >>>>>> *page) >>>>>> +{ >>>>>> + unlock_page(page); >>>>>> + put_page(page); >>>>>> +} >>>>>> + >>>>>> static struct page *nfs_readdir_page_get_locked(struct >>>>>> address_space >>>>>> *mapping, >>>>>> pgoff_t >>>>>> index, >>>>>> u64 >>>>>> last_cookie) >>>>>> { >>>>>> struct page *page; >>>>>> + u64 change_attr; >>>>>> >>>>>> page = grab_cache_page(mapping, index); >>>>>> - if (page && !PageUptodate(page)) { >>>>>> - nfs_readdir_page_init_array(page, >>>>>> last_cookie); >>>>>> - if >>>>>> (invalidate_inode_pages2_range(mapping, index >>>>>> + >>>>>> 1, -1) < 0) >>>>>> - nfs_zap_mapping(mapping->host, >>>>>> mapping); >>>>>> - SetPageUptodate(page); >>>>>> + if (!page) >>>>>> + return NULL; >>>>>> + change_attr = >>>>>> inode_peek_iversion_raw(mapping->host); >>>>>> + if (PageUptodate(page)) { >>>>>> + if >>>>>> (nfs_readdir_page_cookie_match(page, >>>>>> last_cookie, >>>>>> + >>>>>> change_attr)) >>>>>> + return page; >>>>>> + nfs_readdir_clear_array(page); >>>>> >>>>> >>>>> Why use i_version rather than nfs_save_change_attribute? Seems >>>>> having a >>>>> consistent value across the pachecache and dir_verifiers would >>>>> help >>>>> debugging, and we've already have a bunch of machinery around >>>>> the >>>>> change_attribute. >>>> >>>> The directory cache_change_attribute is not reported in >>>> tracepoints >>>> because it is a directory-specific field, so it's not as useful >>>> for >>>> debugging. >>>> >>>> The inode change attribute is what we have traditionally used for >>>> determining cache consistency, and when to invalidate the cache. >>> >>> I should probably elaborate a little further on the differences >>> between >>> the inode change attribute and the cache_change_attribute. >>> >>> One of the main reasons for introducing the latter was to have >>> something that allows us to track changes to the directory, but to >>> avoid forcing unnecessary revalidations of the dcache. >>> >>> What this means is that when we create or remove a file, and the >>> pre/post-op attributes tell us that there were no third party >>> changes >>> to the directory, we update the dcache, but we do _not_ update the >>> cache_change_attribute, because we know that the rest of the >>> directory >>> contents are valid, and so we don't have to revalidate the >>> dentries. >>> However in that case, we _do_ want to update the readdir cache to >>> reflect the fact that an entry was added or deleted. While we could >>> figure out how to remove an entry (at least for the case where the >>> filesystem is case-sensitive), we do not know where the filesystem >>> added the new file, or what cookies was assigned. >>> >>> This is why the inode change attribute is more appropriate for >>> indexing >>> the page cache pages. It reflects the cases where we want to >>> revalidate >>> the readdir cache, as opposed to the dcache. >> >> Ok, thanks for explaining this. >> >> I've noticed that you haven't responded about my concerns about not >> checking >> the directory for changes with every v4 READDIR. For v3, we have >> post-op >> updates to the directory, but with v4 the directory can change and >> we'll >> end up with entries in the cache that are marked with an old >> change_attr. >> > > Then they will be rejected by nfs_readdir_page_cookie_match() if a > user > looks up that page again after we've revalidated the change attribute > on the directory. > > ...and note that NFSv4 does returns a struct change_info4 for all > operations that change the directory, so we will update the change > attribute in all those cases. I'm not worried about changes from the same client. > If the change is made on the server, well then we will detect it > through the standard revalidation process that usually decides when to > invalidate the directory page cache. The environments I'm concerned about are setup very frequently: they look like multiple NFS clients co-ordinating on a directory with millions of files. Some clients are adding files as they do work, other clients are then looking for those files by walking the directory entries to validate their existence. The systems that do this have a "very bad time" if some of them produce listings that are _dramatically_ and transiently different from a listing they produced before. That can happen really easily with what we've got here, and it can create a huge problem for these setups. And it won't be easily reproduceable, and it will be hard to find. It will cost everyone involved a lot of time and effort to track down, and we can fix it easily. >> I'm pretty positive that not checking for changes to the directory >> (not >> sending GETATTR with READDIR) is going to create cases of double- >> listed >> and >> truncated-listings for dirctory listers. Not handling those cases >> means >> I'm >> going to have some very unhappy customers that complain about their >> files >> disappearing/reappearing on NFS. >> >> If you need me to prove that its an issue, I can take the time to >> write >> up >> program that shows this problem. >> > > If you label the page contents with an attribute that was retrieved > _after_ the READDIR op, then you will introduce this as a problem for > your customers. No the problem is already here, we're not introducing it. By labeling the page contents with every call we're shifting the race window from the client where it's a very large window to the server where the window is small. Its still possible, but *much* less likely. > The reason is that there is no atomicity between operations in a > COMPOUND. Worse, the implementation of readdir in scalable modern > systems, including Linux, does not even guarantee atomicity of the > readdir operation itself. Instead each readdir entry is filled without > holding any locks or preventing any changes to the directory or to the > object itself. I understand all this, but its not a reason to make the problem worse. > POSIX states very explicitly that if you're making changes to the > directory after the call to opendir() or rewinddir(), then the > behaviour w.r.t. whether that file appears in the readdir() call is > unspecified. See > https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html Yes, but again - just because the problem exists doesn't give us reason to amplify it when we can easily make a better choice for almost no cost. Here are my reasons for wanting the GETATTR added: - it makes it *much* less likely for this problem to occur, with the minor downside of decreased caching for unstable directories. - it makes v3 and v4 readdir pagecache behavior consistent WRT changing directories. I spent a non-trivial amount of time working on this problem, and saw this exact issue appear. Its definitely something that's going to come back and bite us if we don't fix it. How can I convince you? I've offered to produce a working example of this problem. Will you review those results? If I cannot convince you, I feel I'll have to pursue distro-specific changes for this work. Ben
On Fri, 2022-02-25 at 09:44 -0500, Benjamin Coddington wrote: > On 25 Feb 2022, at 8:10, Trond Myklebust wrote: > > > On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote: > > > On 24 Feb 2022, at 22:51, Trond Myklebust wrote: > > > > > > > On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote: > > > > > On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: > > > > > > On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: > > > > > > > > > > > > > From: Trond Myklebust <trond.myklebust@hammerspace.com> > > > > > > > > > > > > > > Use the change attribute and the first cookie in a > > > > > > > directory > > > > > > > page > > > > > > > cache > > > > > > > entry to validate that the page is up to date. > > > > > > > > > > > > > > Suggested-by: Benjamin Coddington <bcodding@redhat.com> > > > > > > > Signed-off-by: Trond Myklebust > > > > > > > <trond.myklebust@hammerspace.com> > > > > > > > --- > > > > > > > fs/nfs/dir.c | 68 > > > > > > > ++++++++++++++++++++++++++++------------------------ > > > > > > > 1 file changed, 37 insertions(+), 31 deletions(-) > > > > > > > > > > > > > > diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c > > > > > > > index f2258e926df2..5d9367d9b651 100644 > > > > > > > --- a/fs/nfs/dir.c > > > > > > > +++ b/fs/nfs/dir.c > > > > > > > @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { > > > > > > > }; > > > > > > > > > > > > > > struct nfs_cache_array { > > > > > > > + u64 change_attr; > > > > > > > u64 last_cookie; > > > > > > > unsigned int size; > > > > > > > unsigned char page_full : 1, > > > > > > > @@ -175,7 +176,8 @@ static void > > > > > > > nfs_readdir_array_init(struct > > > > > > > nfs_cache_array *array) > > > > > > > memset(array, 0, sizeof(struct nfs_cache_array)); > > > > > > > } > > > > > > > > > > > > > > -static void nfs_readdir_page_init_array(struct page > > > > > > > *page, > > > > > > > u64 > > > > > > > last_cookie) > > > > > > > +static void nfs_readdir_page_init_array(struct page > > > > > > > *page, > > > > > > > u64 > > > > > > > last_cookie, > > > > > > > + u64 > > > > > > > change_attr) > > > > > > > { > > > > > > > struct nfs_cache_array *array; > > > > > > > > > > > > > > > > > > There's a hunk missing here, something like: > > > > > > > > > > > > @@ -185,6 +185,7 @@ static void > > > > > > nfs_readdir_page_init_array(struct > > > > > > page > > > > > > *page, u64 last_cookie, > > > > > > nfs_readdir_array_init(array); > > > > > > array->last_cookie = last_cookie; > > > > > > array->cookies_are_ordered = 1; > > > > > > + array->change_attr = change_attr; > > > > > > kunmap_atomic(array); > > > > > > } > > > > > > > > > > > > > > > > > > > > @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 > > > > > > > last_cookie, > > > > > > > gfp_t gfp_flags) > > > > > > > { > > > > > > > struct page *page = alloc_page(gfp_flags); > > > > > > > if (page) > > > > > > > - nfs_readdir_page_init_array(page, > > > > > > > last_cookie); > > > > > > > + nfs_readdir_page_init_array(page, > > > > > > > last_cookie, > > > > > > > 0); > > > > > > > return page; > > > > > > > } > > > > > > > > > > > > > > @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct > > > > > > > nfs_entry > > > > > > > *entry, struct page *page) > > > > > > > return ret; > > > > > > > } > > > > > > > > > > > > > > +static bool nfs_readdir_page_cookie_match(struct page > > > > > > > *page, > > > > > > > u64 > > > > > > > last_cookie, > > > > > > > + > > > > > > > u64 change_attr) > > > > > > > > > > > > How about "nfs_readdir_page_valid()"? There's more going > > > > > > on > > > > > > than a > > > > > > cookie match. > > > > > > > > > > > > > > > > > > > +{ > > > > > > > + struct nfs_cache_array *array = > > > > > > > kmap_atomic(page); > > > > > > > + int ret = true; > > > > > > > + > > > > > > > + if (array->change_attr != change_attr) > > > > > > > + ret = false; > > > > > > > > > > > > Can we skip the next test if ret = false? > > > > > > > > > > I'd expect the compiler to do that. > > > > > > > > > > > > > > > > > > + if (array->size > 0 && array->array[0].cookie != > > > > > > > last_cookie) > > > > > > > + ret = false; > > > > > > > + kunmap_atomic(array); > > > > > > > + return ret; > > > > > > > +} > > > > > > > + > > > > > > > +static void nfs_readdir_page_unlock_and_put(struct page > > > > > > > *page) > > > > > > > +{ > > > > > > > + unlock_page(page); > > > > > > > + put_page(page); > > > > > > > +} > > > > > > > + > > > > > > > static struct page *nfs_readdir_page_get_locked(struct > > > > > > > address_space > > > > > > > *mapping, > > > > > > > pgoff_t > > > > > > > index, > > > > > > > u64 > > > > > > > last_cookie) > > > > > > > { > > > > > > > struct page *page; > > > > > > > + u64 change_attr; > > > > > > > > > > > > > > page = grab_cache_page(mapping, index); > > > > > > > - if (page && !PageUptodate(page)) { > > > > > > > - nfs_readdir_page_init_array(page, > > > > > > > last_cookie); > > > > > > > - if > > > > > > > (invalidate_inode_pages2_range(mapping, index > > > > > > > + > > > > > > > 1, -1) < 0) > > > > > > > - nfs_zap_mapping(mapping->host, > > > > > > > mapping); > > > > > > > - SetPageUptodate(page); > > > > > > > + if (!page) > > > > > > > + return NULL; > > > > > > > + change_attr = > > > > > > > inode_peek_iversion_raw(mapping->host); > > > > > > > + if (PageUptodate(page)) { > > > > > > > + if > > > > > > > (nfs_readdir_page_cookie_match(page, > > > > > > > last_cookie, > > > > > > > + > > > > > > > change_attr)) > > > > > > > + return page; > > > > > > > + nfs_readdir_clear_array(page); > > > > > > > > > > > > > > > > > > Why use i_version rather than nfs_save_change_attribute? > > > > > > Seems > > > > > > having a > > > > > > consistent value across the pachecache and dir_verifiers > > > > > > would > > > > > > help > > > > > > debugging, and we've already have a bunch of machinery > > > > > > around > > > > > > the > > > > > > change_attribute. > > > > > > > > > > The directory cache_change_attribute is not reported in > > > > > tracepoints > > > > > because it is a directory-specific field, so it's not as > > > > > useful > > > > > for > > > > > debugging. > > > > > > > > > > The inode change attribute is what we have traditionally used > > > > > for > > > > > determining cache consistency, and when to invalidate the > > > > > cache. > > > > > > > > I should probably elaborate a little further on the differences > > > > between > > > > the inode change attribute and the cache_change_attribute. > > > > > > > > One of the main reasons for introducing the latter was to have > > > > something that allows us to track changes to the directory, but > > > > to > > > > avoid forcing unnecessary revalidations of the dcache. > > > > > > > > What this means is that when we create or remove a file, and > > > > the > > > > pre/post-op attributes tell us that there were no third party > > > > changes > > > > to the directory, we update the dcache, but we do _not_ update > > > > the > > > > cache_change_attribute, because we know that the rest of the > > > > directory > > > > contents are valid, and so we don't have to revalidate the > > > > dentries. > > > > However in that case, we _do_ want to update the readdir cache > > > > to > > > > reflect the fact that an entry was added or deleted. While we > > > > could > > > > figure out how to remove an entry (at least for the case where > > > > the > > > > filesystem is case-sensitive), we do not know where the > > > > filesystem > > > > added the new file, or what cookies was assigned. > > > > > > > > This is why the inode change attribute is more appropriate for > > > > indexing > > > > the page cache pages. It reflects the cases where we want to > > > > revalidate > > > > the readdir cache, as opposed to the dcache. > > > > > > Ok, thanks for explaining this. > > > > > > I've noticed that you haven't responded about my concerns about > > > not > > > checking > > > the directory for changes with every v4 READDIR. For v3, we have > > > post-op > > > updates to the directory, but with v4 the directory can change > > > and > > > we'll > > > end up with entries in the cache that are marked with an old > > > change_attr. > > > > > > > Then they will be rejected by nfs_readdir_page_cookie_match() if a > > user > > looks up that page again after we've revalidated the change > > attribute > > on the directory. > > > > ...and note that NFSv4 does returns a struct change_info4 for all > > operations that change the directory, so we will update the change > > attribute in all those cases. > > I'm not worried about changes from the same client. > > > If the change is made on the server, well then we will detect it > > through the standard revalidation process that usually decides when > > to > > invalidate the directory page cache. > > The environments I'm concerned about are setup very frequently: they > look > like multiple NFS clients co-ordinating on a directory with millions > of > files. Some clients are adding files as they do work, other clients > are > then looking for those files by walking the directory entries to > validate > their existence. The systems that do this have a "very bad time" if > some > of them produce listings that are _dramatically_ and transiently > different > from a listing they produced before. > > That can happen really easily with what we've got here, and it can > create a > huge problem for these setups. And it won't be easily reproduceable, > and it > will be hard to find. It will cost everyone involved a lot of time > and > effort to track down, and we can fix it easily. > > > > I'm pretty positive that not checking for changes to the > > > directory > > > (not > > > sending GETATTR with READDIR) is going to create cases of double- > > > listed > > > and > > > truncated-listings for dirctory listers. Not handling those > > > cases > > > means > > > I'm > > > going to have some very unhappy customers that complain about > > > their > > > files > > > disappearing/reappearing on NFS. > > > > > > If you need me to prove that its an issue, I can take the time to > > > write > > > up > > > program that shows this problem. > > > > > > > If you label the page contents with an attribute that was retrieved > > _after_ the READDIR op, then you will introduce this as a problem > > for > > your customers. > > No the problem is already here, we're not introducing it. By > labeling > the > page contents with every call we're shifting the race window from the > client > where it's a very large window to the server where the window is > small. > > Its still possible, but *much* less likely. > > > The reason is that there is no atomicity between operations in a > > COMPOUND. Worse, the implementation of readdir in scalable modern > > systems, including Linux, does not even guarantee atomicity of the > > readdir operation itself. Instead each readdir entry is filled > > without > > holding any locks or preventing any changes to the directory or to > > the > > object itself. > > I understand all this, but its not a reason to make the problem > worse. > > > POSIX states very explicitly that if you're making changes to the > > directory after the call to opendir() or rewinddir(), then the > > behaviour w.r.t. whether that file appears in the readdir() call is > > unspecified. See > > https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html > > Yes, but again - just because the problem exists doesn't give us > reason > to > amplify it when we can easily make a better choice for almost no > cost. > > Here are my reasons for wanting the GETATTR added: > - it makes it *much* less likely for this problem to occur, with > the > minor > downside of decreased caching for unstable directories. > - it makes v3 and v4 readdir pagecache behavior consistent WRT > changing > directories. > > I spent a non-trivial amount of time working on this problem, and saw > this > exact issue appear. Its definitely something that's going to come > back > and > bite us if we don't fix it. > > How can I convince you? I've offered to produce a working example of > this > problem. Will you review those results? If I cannot convince you, I > feel > I'll have to pursue distro-specific changes for this work. Ben, the main cause of this kind of issue in the current code is the following line: /* * ctx->pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have * to either find the entry with the appropriate number or * revalidate the cookie. */ if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; } That line protects the page cache against changes aften opendir(). It was introduced by Red Hat in commmit 07b5ce8ef2d8 in order to fix a claim of a severe performance problem. These patches _remove_ that protection, because we're now able to cope with more frequent revalidation without needing to restart directory reads from scratch. So no. Without further proof, I don't accept your claim that this patchset introduces a regression. I don't accept your claim that we are required to revalidate the change attribute on every readdir call. We can't do that for NFSv2 or NFSv3 (the latter offers a post_op attribute, not pre-op attribute) and as I already pointed out, there is nothing in POSIX that requires this. If you want to fork the Red Hat kernel over it, then that's your decision.
On 25 Feb 2022, at 10:18, Trond Myklebust wrote: > On Fri, 2022-02-25 at 09:44 -0500, Benjamin Coddington wrote: >> On 25 Feb 2022, at 8:10, Trond Myklebust wrote: >> >>> On Fri, 2022-02-25 at 06:38 -0500, Benjamin Coddington wrote: >>>> On 24 Feb 2022, at 22:51, Trond Myklebust wrote: >>>> >>>>> On Fri, 2022-02-25 at 02:26 +0000, Trond Myklebust wrote: >>>>>> On Thu, 2022-02-24 at 09:53 -0500, Benjamin Coddington wrote: >>>>>>> On 23 Feb 2022, at 16:12, trondmy@kernel.org wrote: >>>>>>> >>>>>>>> From: Trond Myklebust <trond.myklebust@hammerspace.com> >>>>>>>> >>>>>>>> Use the change attribute and the first cookie in a >>>>>>>> directory >>>>>>>> page >>>>>>>> cache >>>>>>>> entry to validate that the page is up to date. >>>>>>>> >>>>>>>> Suggested-by: Benjamin Coddington <bcodding@redhat.com> >>>>>>>> Signed-off-by: Trond Myklebust >>>>>>>> <trond.myklebust@hammerspace.com> >>>>>>>> --- >>>>>>>> fs/nfs/dir.c | 68 >>>>>>>> ++++++++++++++++++++++++++++------------------------ >>>>>>>> 1 file changed, 37 insertions(+), 31 deletions(-) >>>>>>>> >>>>>>>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c >>>>>>>> index f2258e926df2..5d9367d9b651 100644 >>>>>>>> --- a/fs/nfs/dir.c >>>>>>>> +++ b/fs/nfs/dir.c >>>>>>>> @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { >>>>>>>> }; >>>>>>>> >>>>>>>> struct nfs_cache_array { >>>>>>>> + u64 change_attr; >>>>>>>> u64 last_cookie; >>>>>>>> unsigned int size; >>>>>>>> unsigned char page_full : 1, >>>>>>>> @@ -175,7 +176,8 @@ static void >>>>>>>> nfs_readdir_array_init(struct >>>>>>>> nfs_cache_array *array) >>>>>>>> memset(array, 0, sizeof(struct >>>>>>>> nfs_cache_array)); >>>>>>>> } >>>>>>>> >>>>>>>> -static void nfs_readdir_page_init_array(struct page >>>>>>>> *page, >>>>>>>> u64 >>>>>>>> last_cookie) >>>>>>>> +static void nfs_readdir_page_init_array(struct page >>>>>>>> *page, >>>>>>>> u64 >>>>>>>> last_cookie, >>>>>>>> + u64 >>>>>>>> change_attr) >>>>>>>> { >>>>>>>> struct nfs_cache_array *array; >>>>>>> >>>>>>> >>>>>>> There's a hunk missing here, something like: >>>>>>> >>>>>>> @@ -185,6 +185,7 @@ static void >>>>>>> nfs_readdir_page_init_array(struct >>>>>>> page >>>>>>> *page, u64 last_cookie, >>>>>>> nfs_readdir_array_init(array); >>>>>>> array->last_cookie = last_cookie; >>>>>>> array->cookies_are_ordered = 1; >>>>>>> + array->change_attr = change_attr; >>>>>>> kunmap_atomic(array); >>>>>>> } >>>>>>> >>>>>>>> >>>>>>>> @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 >>>>>>>> last_cookie, >>>>>>>> gfp_t gfp_flags) >>>>>>>> { >>>>>>>> struct page *page = alloc_page(gfp_flags); >>>>>>>> if (page) >>>>>>>> - nfs_readdir_page_init_array(page, >>>>>>>> last_cookie); >>>>>>>> + nfs_readdir_page_init_array(page, >>>>>>>> last_cookie, >>>>>>>> 0); >>>>>>>> return page; >>>>>>>> } >>>>>>>> >>>>>>>> @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct >>>>>>>> nfs_entry >>>>>>>> *entry, struct page *page) >>>>>>>> return ret; >>>>>>>> } >>>>>>>> >>>>>>>> +static bool nfs_readdir_page_cookie_match(struct page >>>>>>>> *page, >>>>>>>> u64 >>>>>>>> last_cookie, >>>>>>>> + >>>>>>>> u64 change_attr) >>>>>>> >>>>>>> How about "nfs_readdir_page_valid()"? There's more going >>>>>>> on >>>>>>> than a >>>>>>> cookie match. >>>>>>> >>>>>>> >>>>>>>> +{ >>>>>>>> + struct nfs_cache_array *array = >>>>>>>> kmap_atomic(page); >>>>>>>> + int ret = true; >>>>>>>> + >>>>>>>> + if (array->change_attr != change_attr) >>>>>>>> + ret = false; >>>>>>> >>>>>>> Can we skip the next test if ret = false? >>>>>> >>>>>> I'd expect the compiler to do that. >>>>>> >>>>>>> >>>>>>>> + if (array->size > 0 && array->array[0].cookie != >>>>>>>> last_cookie) >>>>>>>> + ret = false; >>>>>>>> + kunmap_atomic(array); >>>>>>>> + return ret; >>>>>>>> +} >>>>>>>> + >>>>>>>> +static void nfs_readdir_page_unlock_and_put(struct page >>>>>>>> *page) >>>>>>>> +{ >>>>>>>> + unlock_page(page); >>>>>>>> + put_page(page); >>>>>>>> +} >>>>>>>> + >>>>>>>> static struct page *nfs_readdir_page_get_locked(struct >>>>>>>> address_space >>>>>>>> *mapping, >>>>>>>> pgoff_t >>>>>>>> index, >>>>>>>> u64 >>>>>>>> last_cookie) >>>>>>>> { >>>>>>>> struct page *page; >>>>>>>> + u64 change_attr; >>>>>>>> >>>>>>>> page = grab_cache_page(mapping, index); >>>>>>>> - if (page && !PageUptodate(page)) { >>>>>>>> - nfs_readdir_page_init_array(page, >>>>>>>> last_cookie); >>>>>>>> - if >>>>>>>> (invalidate_inode_pages2_range(mapping, index >>>>>>>> + >>>>>>>> 1, -1) < 0) >>>>>>>> - nfs_zap_mapping(mapping->host, >>>>>>>> mapping); >>>>>>>> - SetPageUptodate(page); >>>>>>>> + if (!page) >>>>>>>> + return NULL; >>>>>>>> + change_attr = >>>>>>>> inode_peek_iversion_raw(mapping->host); >>>>>>>> + if (PageUptodate(page)) { >>>>>>>> + if >>>>>>>> (nfs_readdir_page_cookie_match(page, >>>>>>>> last_cookie, >>>>>>>> + >>>>>>>> change_attr)) >>>>>>>> + return page; >>>>>>>> + nfs_readdir_clear_array(page); >>>>>>> >>>>>>> >>>>>>> Why use i_version rather than nfs_save_change_attribute? >>>>>>> Seems >>>>>>> having a >>>>>>> consistent value across the pachecache and dir_verifiers >>>>>>> would >>>>>>> help >>>>>>> debugging, and we've already have a bunch of machinery >>>>>>> around >>>>>>> the >>>>>>> change_attribute. >>>>>> >>>>>> The directory cache_change_attribute is not reported in >>>>>> tracepoints >>>>>> because it is a directory-specific field, so it's not as >>>>>> useful >>>>>> for >>>>>> debugging. >>>>>> >>>>>> The inode change attribute is what we have traditionally used >>>>>> for >>>>>> determining cache consistency, and when to invalidate the >>>>>> cache. >>>>> >>>>> I should probably elaborate a little further on the differences >>>>> between >>>>> the inode change attribute and the cache_change_attribute. >>>>> >>>>> One of the main reasons for introducing the latter was to have >>>>> something that allows us to track changes to the directory, but >>>>> to >>>>> avoid forcing unnecessary revalidations of the dcache. >>>>> >>>>> What this means is that when we create or remove a file, and >>>>> the >>>>> pre/post-op attributes tell us that there were no third party >>>>> changes >>>>> to the directory, we update the dcache, but we do _not_ update >>>>> the >>>>> cache_change_attribute, because we know that the rest of the >>>>> directory >>>>> contents are valid, and so we don't have to revalidate the >>>>> dentries. >>>>> However in that case, we _do_ want to update the readdir cache >>>>> to >>>>> reflect the fact that an entry was added or deleted. While we >>>>> could >>>>> figure out how to remove an entry (at least for the case where >>>>> the >>>>> filesystem is case-sensitive), we do not know where the >>>>> filesystem >>>>> added the new file, or what cookies was assigned. >>>>> >>>>> This is why the inode change attribute is more appropriate for >>>>> indexing >>>>> the page cache pages. It reflects the cases where we want to >>>>> revalidate >>>>> the readdir cache, as opposed to the dcache. >>>> >>>> Ok, thanks for explaining this. >>>> >>>> I've noticed that you haven't responded about my concerns about >>>> not >>>> checking >>>> the directory for changes with every v4 READDIR. For v3, we have >>>> post-op >>>> updates to the directory, but with v4 the directory can change >>>> and >>>> we'll >>>> end up with entries in the cache that are marked with an old >>>> change_attr. >>>> >>> >>> Then they will be rejected by nfs_readdir_page_cookie_match() if a >>> user >>> looks up that page again after we've revalidated the change >>> attribute >>> on the directory. >>> >>> ...and note that NFSv4 does returns a struct change_info4 for all >>> operations that change the directory, so we will update the change >>> attribute in all those cases. >> >> I'm not worried about changes from the same client. >> >>> If the change is made on the server, well then we will detect it >>> through the standard revalidation process that usually decides when >>> to >>> invalidate the directory page cache. >> >> The environments I'm concerned about are setup very frequently: they >> look >> like multiple NFS clients co-ordinating on a directory with millions >> of >> files. Some clients are adding files as they do work, other clients >> are >> then looking for those files by walking the directory entries to >> validate >> their existence. The systems that do this have a "very bad time" if >> some >> of them produce listings that are _dramatically_ and transiently >> different >> from a listing they produced before. >> >> That can happen really easily with what we've got here, and it can >> create a >> huge problem for these setups. And it won't be easily >> reproduceable, >> and it >> will be hard to find. It will cost everyone involved a lot of time >> and >> effort to track down, and we can fix it easily. >> >>>> I'm pretty positive that not checking for changes to the >>>> directory >>>> (not >>>> sending GETATTR with READDIR) is going to create cases of double- >>>> listed >>>> and >>>> truncated-listings for dirctory listers. Not handling those >>>> cases >>>> means >>>> I'm >>>> going to have some very unhappy customers that complain about >>>> their >>>> files >>>> disappearing/reappearing on NFS. >>>> >>>> If you need me to prove that its an issue, I can take the time to >>>> write >>>> up >>>> program that shows this problem. >>>> >>> >>> If you label the page contents with an attribute that was retrieved >>> _after_ the READDIR op, then you will introduce this as a problem >>> for >>> your customers. >> >> No the problem is already here, we're not introducing it. By >> labeling >> the >> page contents with every call we're shifting the race window from the >> client >> where it's a very large window to the server where the window is >> small. >> >> Its still possible, but *much* less likely. >> >>> The reason is that there is no atomicity between operations in a >>> COMPOUND. Worse, the implementation of readdir in scalable modern >>> systems, including Linux, does not even guarantee atomicity of the >>> readdir operation itself. Instead each readdir entry is filled >>> without >>> holding any locks or preventing any changes to the directory or to >>> the >>> object itself. >> >> I understand all this, but its not a reason to make the problem >> worse. >> >>> POSIX states very explicitly that if you're making changes to the >>> directory after the call to opendir() or rewinddir(), then the >>> behaviour w.r.t. whether that file appears in the readdir() call is >>> unspecified. See >>> https://pubs.opengroup.org/onlinepubs/9699919799/functions/readdir.html >> >> Yes, but again - just because the problem exists doesn't give us >> reason >> to >> amplify it when we can easily make a better choice for almost no >> cost. >> >> Here are my reasons for wanting the GETATTR added: >> - it makes it *much* less likely for this problem to occur, with >> the >> minor >> downside of decreased caching for unstable directories. >> - it makes v3 and v4 readdir pagecache behavior consistent WRT >> changing >> directories. >> >> I spent a non-trivial amount of time working on this problem, and saw >> this >> exact issue appear. Its definitely something that's going to come >> back >> and >> bite us if we don't fix it. >> >> How can I convince you? I've offered to produce a working example >> of >> this >> problem. Will you review those results? If I cannot convince you, >> I >> feel >> I'll have to pursue distro-specific changes for this work. > > Ben, the main cause of this kind of issue in the current code is the > following line: > > /* > * ctx->pos points to the dirent entry number. > * *desc->dir_cookie has the cookie for the next entry. We > have > * to either find the entry with the appropriate number or > * revalidate the cookie. > */ > if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { > res = nfs_revalidate_mapping(inode, file->f_mapping); > if (res < 0) > goto out; > } > > > That line protects the page cache against changes aften opendir(). It > was introduced by Red Hat in commmit 07b5ce8ef2d8 in order to fix a > claim of a severe performance problem. > > These patches _remove_ that protection, because we're now able to cope > with more frequent revalidation without needing to restart directory > reads from scratch. Yes, I know. But the big change is that now we're heavily relying on page validation to produce sane listing results, and proper page validation relies on up-to-date change info. > So no. Without further proof, I don't accept your claim that this > patchset introduces a regression. I don't accept your claim that we > are > required to revalidate the change attribute on every readdir call. We > can't do that for NFSv2 or NFSv3 (the latter offers a post_op > attribute, not pre-op attribute) and as I already pointed out, there > is > nothing in POSIX that requires this. You don't need a pre-op attribute. You just need to detect the case where you're walking into pages that contain entries that don't match the ones you're currently using, and post-op is as good as we can get it. Ok, so I'm reading that further proof is required, and I'm happy to do the work. Thanks for the replies here and elsewhere. Ben
On 25 Feb 2022, at 10:34, Benjamin Coddington wrote: > Ok, so I'm reading that further proof is required, and I'm happy to do > the > work. Thanks for the replies here and elsewhere. Here's an example of this problem on a tmpfs export using v8 of your patchset with the fix to set the change_attr in nfs_readdir_page_init_array(). I'm using tmpfs, because it reliably orders cookies in reverse order of creation (or perhaps sorted by name). The program drives both the client-side and server-side - so on this one system, /exports/tmpfs is: tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0 and /mnt/localhost is: localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4 rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=none,addr=127.0.0.1 0 0 The program creates 256 files on the server, walks through them once on the client, deletes the last 127 on the server, drops the first page from the pagecache, and walks through them again on the client. The second listing produces 124 duplicate entries. I just have to say again: this behavior is _new_ (but not new to me), and it is absolutely going to crop up on our customer's systems that are walking through millions of directory entries on loaded servers under memory pressure. The directory listings as a whole become very likely to be nonsense at random times. I realize they are not /supposed/ to be coherent, but what we're getting here is going to be far far less coherent, and its going to be a mess. There are other scenarios that are worse when the cookies aren't ordered, you can end up with EOF, or get into repeating patterns. Please compare this with v3, and before this patchset, and tell me if I'm not justified playing chicken little. Here's what I do to run this: mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/ exportfs -ofsid=0 *:/exports exportfs -ofsid=1 *:/exports/tmpfs mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost ./getdents2 Compare "Listing 1" with "Listing 2". I would also do a "rm -f /export/tmpfs/*" between each run. Thanks again for your time and work. Ben #define _GNU_SOURCE #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <sched.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/syscall.h> #include <string.h> #define NFSDIR "/mnt/localhost/tmpfs" #define LOCDIR "/exports/tmpfs" #define BUF_SIZE 4096 int main(int argc, char **argv) { int i, dir_fd, bpos, total = 0; size_t nread; struct linux_dirent { long d_ino; off_t d_off; unsigned short d_reclen; char d_name[]; }; struct linux_dirent *d; char buf[BUF_SIZE]; /* create files: */ for (i = 0; i < 256; i++) { sprintf(buf, LOCDIR "/file_%03d", i); close(open(buf, O_CREAT, 666)); } dir_fd = open(NFSDIR, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC); if (dir_fd < 0) { perror("cannot open dir"); return 1; } while (1) { nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE); if (nread == 0 || nread == -1) break; for (bpos = 0; bpos < nread;) { d = (struct linux_dirent *) (buf + bpos); printf("%s\n", d->d_name); total++; bpos += d->d_reclen; } } printf("Listing 1: %d total dirents\n", total); /* rewind */ lseek(dir_fd, 0, SEEK_SET); /* drop the first page */ posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED); /* delete the last 127 files: */ for (i = 127; i < 256; i++) { sprintf(buf, LOCDIR "/file_%03d", i); unlink(buf); } total = 0; while (1) { nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE); if (nread == 0 || nread == -1) break; for (bpos = 0; bpos < nread;) { d = (struct linux_dirent *) (buf + bpos); printf("%s\n", d->d_name); total++; bpos += d->d_reclen; } } printf("Listing 2: %d total dirents\n", total); close(dir_fd); return 0; }
On 25 Feb 2022, at 15:23, Benjamin Coddington wrote: > int main(int argc, char **argv) > { > int i, dir_fd, bpos, total = 0; > size_t nread; > struct linux_dirent { Ugh.. and sorry about the whitespace mess.
On Fri, 2022-02-25 at 15:23 -0500, Benjamin Coddington wrote: > On 25 Feb 2022, at 10:34, Benjamin Coddington wrote: > > Ok, so I'm reading that further proof is required, and I'm happy to > > do > > the > > work. Thanks for the replies here and elsewhere. > > Here's an example of this problem on a tmpfs export using v8 of your > patchset with the fix to set the change_attr in > nfs_readdir_page_init_array(). > > I'm using tmpfs, because it reliably orders cookies in reverse order > of > creation (or perhaps sorted by name). > > The program drives both the client-side and server-side - so on this > one > system, /exports/tmpfs is: > tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0 > > and /mnt/localhost is: > localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4 > rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,prot > o=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=non > e,addr=127.0.0.1 > 0 0 > > The program creates 256 files on the server, walks through them once > on > the > client, deletes the last 127 on the server, drops the first page from > the > pagecache, and walks through them again on the client. > > The second listing produces 124 duplicate entries. > > I just have to say again: this behavior is _new_ (but not new to me), > and it > is absolutely going to crop up on our customer's systems that are > walking > through millions of directory entries on loaded servers under memory > pressure. The directory listings as a whole become very likely to be > nonsense at random times. I realize they are not /supposed/ to be > coherent, > but what we're getting here is going to be far far less coherent, and > its > going to be a mess. > > There are other scenarios that are worse when the cookies aren't > ordered, > you can end up with EOF, or get into repeating patterns. > > Please compare this with v3, and before this patchset, and tell me if > I'm > not justified playing chicken little. > > Here's what I do to run this: > > mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/ > exportfs -ofsid=0 *:/exports > exportfs -ofsid=1 *:/exports/tmpfs > mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost > ./getdents2 > > Compare "Listing 1" with "Listing 2". > > I would also do a "rm -f /export/tmpfs/*" between each run. > > Thanks again for your time and work. > > Ben > > #define _GNU_SOURCE > #include <stdio.h> > #include <unistd.h> > #include <fcntl.h> > #include <sched.h> > #include <sys/types.h> > #include <sys/stat.h> > #include <sys/syscall.h> > #include <string.h> > > #define NFSDIR "/mnt/localhost/tmpfs" > #define LOCDIR "/exports/tmpfs" > #define BUF_SIZE 4096 > > int main(int argc, char **argv) > { > int i, dir_fd, bpos, total = 0; > size_t nread; > struct linux_dirent { > long d_ino; > off_t d_off; > unsigned short d_reclen; > char d_name[]; > }; > struct linux_dirent *d; > char buf[BUF_SIZE]; > > /* create files: */ > for (i = 0; i < 256; i++) { > sprintf(buf, LOCDIR "/file_%03d", i); > close(open(buf, O_CREAT, 666)); > } > > dir_fd = open(NFSDIR, > O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC); > if (dir_fd < 0) { > perror("cannot open dir"); > return 1; > } > > while (1) { > nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE); > if (nread == 0 || nread == -1) > break; > for (bpos = 0; bpos < nread;) { > d = (struct linux_dirent *) (buf + bpos); > printf("%s\n", d->d_name); > total++; > bpos += d->d_reclen; > } > } > printf("Listing 1: %d total dirents\n", total); > > /* rewind */ > lseek(dir_fd, 0, SEEK_SET); > > /* drop the first page */ > posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED); > > /* delete the last 127 files: */ > for (i = 127; i < 256; i++) { > sprintf(buf, LOCDIR "/file_%03d", i); > unlink(buf); > } > > total = 0; > while (1) { > nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE); > if (nread == 0 || nread == -1) > break; > for (bpos = 0; bpos < nread;) { > d = (struct linux_dirent *) (buf + bpos); > printf("%s\n", d->d_name); > total++; > bpos += d->d_reclen; > } > } > printf("Listing 2: %d total dirents\n", total); > > close(dir_fd); > return 0; > } tmpfs is broken on the server. It doesn't provide stable cookies, and knfsd doesn't use the verifier to tell you that the cookie assignment has changed. Re-export of tmpfs has never worked reliably.
On 25 Feb 2022, at 15:41, Trond Myklebust wrote: > On Fri, 2022-02-25 at 15:23 -0500, Benjamin Coddington wrote: >> On 25 Feb 2022, at 10:34, Benjamin Coddington wrote: >>> Ok, so I'm reading that further proof is required, and I'm happy to >>> do >>> the >>> work. Thanks for the replies here and elsewhere. >> >> Here's an example of this problem on a tmpfs export using v8 of your >> patchset with the fix to set the change_attr in >> nfs_readdir_page_init_array(). >> >> I'm using tmpfs, because it reliably orders cookies in reverse order >> of >> creation (or perhaps sorted by name). >> >> The program drives both the client-side and server-side - so on this >> one >> system, /exports/tmpfs is: >> tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0 >> >> and /mnt/localhost is: >> localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4 >> rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,prot >> o=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=non >> e,addr=127.0.0.1 >> 0 0 >> >> The program creates 256 files on the server, walks through them once >> on >> the >> client, deletes the last 127 on the server, drops the first page from >> the >> pagecache, and walks through them again on the client. >> >> The second listing produces 124 duplicate entries. >> >> I just have to say again: this behavior is _new_ (but not new to me), >> and it >> is absolutely going to crop up on our customer's systems that are >> walking >> through millions of directory entries on loaded servers under memory >> pressure. The directory listings as a whole become very likely to >> be >> nonsense at random times. I realize they are not /supposed/ to be >> coherent, >> but what we're getting here is going to be far far less coherent, and >> its >> going to be a mess. >> >> There are other scenarios that are worse when the cookies aren't >> ordered, >> you can end up with EOF, or get into repeating patterns. >> >> Please compare this with v3, and before this patchset, and tell me if >> I'm >> not justified playing chicken little. >> >> Here's what I do to run this: >> >> mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/ >> exportfs -ofsid=0 *:/exports >> exportfs -ofsid=1 *:/exports/tmpfs >> mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost >> ./getdents2 >> >> Compare "Listing 1" with "Listing 2". >> >> I would also do a "rm -f /export/tmpfs/*" between each run. >> >> Thanks again for your time and work. >> >> Ben >> >> #define _GNU_SOURCE >> #include <stdio.h> >> #include <unistd.h> >> #include <fcntl.h> >> #include <sched.h> >> #include <sys/types.h> >> #include <sys/stat.h> >> #include <sys/syscall.h> >> #include <string.h> >> >> #define NFSDIR "/mnt/localhost/tmpfs" >> #define LOCDIR "/exports/tmpfs" >> #define BUF_SIZE 4096 >> >> int main(int argc, char **argv) >> { >> int i, dir_fd, bpos, total = 0; >> size_t nread; >> struct linux_dirent { >> long d_ino; >> off_t d_off; >> unsigned short d_reclen; >> char d_name[]; >> }; >> struct linux_dirent *d; >> char buf[BUF_SIZE]; >> >> /* create files: */ >> for (i = 0; i < 256; i++) { >> sprintf(buf, LOCDIR "/file_%03d", i); >> close(open(buf, O_CREAT, 666)); >> } >> >> dir_fd = open(NFSDIR, >> O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC); >> if (dir_fd < 0) { >> perror("cannot open dir"); >> return 1; >> } >> >> while (1) { >> nread = syscall(SYS_getdents, dir_fd, >> buf, BUF_SIZE); >> if (nread == 0 || nread == -1) >> break; >> for (bpos = 0; bpos < nread;) { >> d = (struct linux_dirent *) (buf + bpos); >> printf("%s\n", d->d_name); >> total++; >> bpos += d->d_reclen; >> } >> } >> printf("Listing 1: %d total dirents\n", total); >> >> /* rewind */ >> lseek(dir_fd, 0, SEEK_SET); >> >> /* drop the first page */ >> posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED); >> >> /* delete the last 127 files: */ >> for (i = 127; i < 256; i++) { >> sprintf(buf, LOCDIR "/file_%03d", i); >> unlink(buf); >> } >> >> total = 0; >> while (1) { >> nread = syscall(SYS_getdents, dir_fd, >> buf, BUF_SIZE); >> if (nread == 0 || nread == -1) >> break; >> for (bpos = 0; bpos < nread;) { >> d = (struct linux_dirent *) (buf + bpos); >> printf("%s\n", d->d_name); >> total++; >> bpos += d->d_reclen; >> } >> } >> printf("Listing 2: %d total dirents\n", total); >> >> close(dir_fd); >> return 0; >> } > > > tmpfs is broken on the server. It doesn't provide stable cookies, and > knfsd doesn't use the verifier to tell you that the cookie assignment > has changed. > > > Re-export of tmpfs has never worked reliably. In this case, the cookies are stable, they can be verified with a wire capture. I've just adapted the program slightly to ext4 below. In this case the Listing 2 shows files "file_125 file_126" that don't exist on the server and leave out files "elif_125 elif_126". It would take me more time to produce more dramatic results, but I'm sure I could produce them. And its not hard to understand either, so I'm not sure why so much proof is needed. #define _GNU_SOURCE #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <sched.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/syscall.h> #include <string.h> #define NFSDIR "/mnt/localhost/ext4" #define LOCDIR "/exports/ext4" #define BUF_SIZE 4096 int main(int argc, char **argv) { int i, dir_fd, bpos, total = 0; size_t nread; struct linux_dirent { long d_ino; off_t d_off; unsigned short d_reclen; char d_name[]; }; struct linux_dirent *d; char buf[BUF_SIZE]; /* create files: */ for (i = 0; i < 256; i++) { sprintf(buf, LOCDIR "/file_%03d", i); close(open(buf, O_CREAT, 666)); } dir_fd = open(NFSDIR, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC); if (dir_fd < 0) { perror("cannot open dir"); return 1; } while (1) { nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE); if (nread == 0 || nread == -1) break; for (bpos = 0; bpos < nread;) { d = (struct linux_dirent *) (buf + bpos); printf("%s\n", d->d_name); total++; bpos += d->d_reclen; } } printf("Listing 1: %d total dirents\n", total); /* rewind */ lseek(dir_fd, 0, SEEK_SET); /* drop the first page */ posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED); /* delete the first 127 files: */ for (i = 0; i < 127; i++) { sprintf(buf, LOCDIR "/file_%03d", i); unlink(buf); } /* create 127 more: */ for (i = 0; i < 127; i++) { sprintf(buf, LOCDIR "/elif_%03d", i); close(open(buf, O_CREAT, 666)); } total = 0; while (1) { nread = syscall(SYS_getdents, dir_fd, buf, BUF_SIZE); if (nread == 0 || nread == -1) break; for (bpos = 0; bpos < nread;) { d = (struct linux_dirent *) (buf + bpos); printf("%s\n", d->d_name); total++; bpos += d->d_reclen; } } printf("Listing 2: %d total dirents\n", total); close(dir_fd); return 0; }
On Fri, 2022-02-25 at 15:41 -0500, Trond Myklebust wrote: > On Fri, 2022-02-25 at 15:23 -0500, Benjamin Coddington wrote: > > On 25 Feb 2022, at 10:34, Benjamin Coddington wrote: > > > Ok, so I'm reading that further proof is required, and I'm happy > > > to > > > do > > > the > > > work. Thanks for the replies here and elsewhere. > > > > Here's an example of this problem on a tmpfs export using v8 of > > your > > patchset with the fix to set the change_attr in > > nfs_readdir_page_init_array(). > > > > I'm using tmpfs, because it reliably orders cookies in reverse > > order > > of > > creation (or perhaps sorted by name). > > > > The program drives both the client-side and server-side - so on > > this > > one > > system, /exports/tmpfs is: > > tmpfs /exports/tmpfs tmpfs rw,seclabel,relatime,size=102400k 0 0 > > > > and /mnt/localhost is: > > localhost:/exports/tmpfs /mnt/localhost/tmpfs nfs4 > > rw,relatime,vers=4.1,rsize=1048576,wsize=1048576,namlen=255,hard,pr > > ot > > o=tcp,timeo=600,retrans=2,sec=sys,clientaddr=127.0.0.1,local_lock=n > > on > > e,addr=127.0.0.1 > > 0 0 > > > > The program creates 256 files on the server, walks through them > > once > > on > > the > > client, deletes the last 127 on the server, drops the first page > > from > > the > > pagecache, and walks through them again on the client. > > > > The second listing produces 124 duplicate entries. > > > > I just have to say again: this behavior is _new_ (but not new to > > me), > > and it > > is absolutely going to crop up on our customer's systems that are > > walking > > through millions of directory entries on loaded servers under > > memory > > pressure. The directory listings as a whole become very likely to > > be > > nonsense at random times. I realize they are not /supposed/ to be > > coherent, > > but what we're getting here is going to be far far less coherent, > > and > > its > > going to be a mess. > > > > There are other scenarios that are worse when the cookies aren't > > ordered, > > you can end up with EOF, or get into repeating patterns. > > > > Please compare this with v3, and before this patchset, and tell me > > if > > I'm > > not justified playing chicken little. > > > > Here's what I do to run this: > > > > mount -t tmpfs -osize=100M tmpfs /exports/tmpfs/ > > exportfs -ofsid=0 *:/exports > > exportfs -ofsid=1 *:/exports/tmpfs > > mount -t nfs -ov4.1,sec=sys localhost:/exports /mnt/localhost > > ./getdents2 > > > > Compare "Listing 1" with "Listing 2". > > > > I would also do a "rm -f /export/tmpfs/*" between each run. > > > > Thanks again for your time and work. > > > > Ben > > > > #define _GNU_SOURCE > > #include <stdio.h> > > #include <unistd.h> > > #include <fcntl.h> > > #include <sched.h> > > #include <sys/types.h> > > #include <sys/stat.h> > > #include <sys/syscall.h> > > #include <string.h> > > > > #define NFSDIR "/mnt/localhost/tmpfs" > > #define LOCDIR "/exports/tmpfs" > > #define BUF_SIZE 4096 > > > > int main(int argc, char **argv) > > { > > int i, dir_fd, bpos, total = 0; > > size_t nread; > > struct linux_dirent { > > long d_ino; > > off_t d_off; > > unsigned short d_reclen; > > char d_name[]; > > }; > > struct linux_dirent *d; > > char buf[BUF_SIZE]; > > > > /* create files: */ > > for (i = 0; i < 256; i++) { > > sprintf(buf, LOCDIR "/file_%03d", i); > > close(open(buf, O_CREAT, 666)); > > } > > > > dir_fd = open(NFSDIR, > > O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC); > > if (dir_fd < 0) { > > perror("cannot open dir"); > > return 1; > > } > > > > while (1) { > > nread = syscall(SYS_getdents, dir_fd, buf, > > BUF_SIZE); > > if (nread == 0 || nread == -1) > > break; > > for (bpos = 0; bpos < nread;) { > > d = (struct linux_dirent *) (buf + bpos); > > printf("%s\n", d->d_name); > > total++; > > bpos += d->d_reclen; > > } > > } > > printf("Listing 1: %d total dirents\n", total); > > > > /* rewind */ > > lseek(dir_fd, 0, SEEK_SET); > > > > /* drop the first page */ > > posix_fadvise(dir_fd, 0, 4096, POSIX_FADV_DONTNEED); > > > > /* delete the last 127 files: */ > > for (i = 127; i < 256; i++) { > > sprintf(buf, LOCDIR "/file_%03d", i); > > unlink(buf); > > } > > > > total = 0; > > while (1) { > > nread = syscall(SYS_getdents, dir_fd, buf, > > BUF_SIZE); > > if (nread == 0 || nread == -1) > > break; > > for (bpos = 0; bpos < nread;) { > > d = (struct linux_dirent *) (buf + bpos); > > printf("%s\n", d->d_name); > > total++; > > bpos += d->d_reclen; > > } > > } > > printf("Listing 2: %d total dirents\n", total); > > > > close(dir_fd); > > return 0; > > } > > > tmpfs is broken on the server. It doesn't provide stable cookies, and > knfsd doesn't use the verifier to tell you that the cookie assignment > has changed. > > > Re-export of tmpfs has never worked reliably. What I mean is that tmpfs is always a poor choice for NFS because seekdir()/telldir() don't work reliably, and so READDIR cannot work reliably, since it relies on open()+seekdir() to continue reading the directory in successive RPC calls. Anyhow, to get back to your question about whether we should or should not be detecting that the directory changed when you delete the files on the server. The answer is no... Nothing in the above guarantees that the cache is revalidated. NFS close to open cache consistency means that we guarantee to revalidate the cached data on open(), and only then. That guarantee does not extend to lseek() or to the rewinddir/seekdir wrappers. If your application wants stronger cache consistency, then there are tricks to enable that. Now that statx() has the AT_STATX_FORCE_SYNC flag, you could use that to force a revalidation of the directory attributes on the client. You might also use the posix_fadvise() trick to try to clear the cache. However note that none of these tricks are guaranteed to work. They're not reliable now, and that situation is unlikely to change in the future barring a deliberate (and documented!) change in kernel policy. So as of now, the only way to reliably introduce a revalidation point in your testcase above is to close() and then open().
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f2258e926df2..5d9367d9b651 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -139,6 +139,7 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { + u64 change_attr; u64 last_cookie; unsigned int size; unsigned char page_full : 1, @@ -175,7 +176,8 @@ static void nfs_readdir_array_init(struct nfs_cache_array *array) memset(array, 0, sizeof(struct nfs_cache_array)); } -static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) +static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie, + u64 change_attr) { struct nfs_cache_array *array; @@ -207,7 +209,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) { struct page *page = alloc_page(gfp_flags); if (page) - nfs_readdir_page_init_array(page, last_cookie); + nfs_readdir_page_init_array(page, last_cookie, 0); return page; } @@ -304,19 +306,44 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) return ret; } +static bool nfs_readdir_page_cookie_match(struct page *page, u64 last_cookie, + u64 change_attr) +{ + struct nfs_cache_array *array = kmap_atomic(page); + int ret = true; + + if (array->change_attr != change_attr) + ret = false; + if (array->size > 0 && array->array[0].cookie != last_cookie) + ret = false; + kunmap_atomic(array); + return ret; +} + +static void nfs_readdir_page_unlock_and_put(struct page *page) +{ + unlock_page(page); + put_page(page); +} + static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, pgoff_t index, u64 last_cookie) { struct page *page; + u64 change_attr; page = grab_cache_page(mapping, index); - if (page && !PageUptodate(page)) { - nfs_readdir_page_init_array(page, last_cookie); - if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) - nfs_zap_mapping(mapping->host, mapping); - SetPageUptodate(page); + if (!page) + return NULL; + change_attr = inode_peek_iversion_raw(mapping->host); + if (PageUptodate(page)) { + if (nfs_readdir_page_cookie_match(page, last_cookie, + change_attr)) + return page; + nfs_readdir_clear_array(page); } - + nfs_readdir_page_init_array(page, last_cookie, change_attr); + SetPageUptodate(page); return page; } @@ -356,12 +383,6 @@ static void nfs_readdir_page_set_eof(struct page *page) kunmap_atomic(array); } -static void nfs_readdir_page_unlock_and_put(struct page *page) -{ - unlock_page(page); - put_page(page); -} - static struct page *nfs_readdir_page_get_next(struct address_space *mapping, pgoff_t index, u64 cookie) { @@ -418,16 +439,6 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, return -EBADCOOKIE; } -static bool -nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) -{ - if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE | - NFS_INO_INVALID_DATA)) - return false; - smp_rmb(); - return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); -} - static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, u64 cookie) { @@ -456,8 +467,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); new_pos = nfs_readdir_page_offset(desc->page) + i; - if (desc->attr_gencount != nfsi->attr_gencount || - !nfs_readdir_inode_mapping_valid(nfsi)) { + if (desc->attr_gencount != nfsi->attr_gencount) { desc->duped = 0; desc->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->prev_index) { @@ -1094,11 +1104,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { - res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) - goto out; - } + nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); res = -ENOMEM; desc = kzalloc(sizeof(*desc), GFP_KERNEL);