@@ -1382,6 +1382,8 @@ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
continue;
+ if (xa_is_sibling(entry))
+ continue;
if (!xa_is_node(entry))
return entry;
xas->xa_node = xa_to_node(entry);
@@ -227,6 +227,7 @@ static void *load_creator(void *ptr)
unsigned long index = (3 << RADIX_TREE_MAP_SHIFT) -
(1 << order);
item_insert_order(tree, index, order);
+ xa_set_mark(tree, index, XA_MARK_1);
item_delete_rcu(tree, index);
}
}
@@ -242,8 +243,11 @@ static void *load_worker(void *ptr)
rcu_register_thread();
while (!stop_iteration) {
+ unsigned long find_index = (2 << RADIX_TREE_MAP_SHIFT) + 1;
struct item *item = xa_load(ptr, index);
assert(!xa_is_internal(item));
+ item = xa_find(ptr, &find_index, index, XA_MARK_1);
+ assert(!xa_is_internal(item));
}
rcu_unregister_thread();
Similar to issue fixed in commit cbc02854331ed ("XArray: Do not return sibling entries from xa_load()"), we may return sibling entries from xas_find_marked as following: Thread A: Thread B: xa_store_range(xa, entry, 6, 7, gfp); xa_set_mark(xa, 6, mark) XA_STATE(xas, xa, 6); xas_find_marked(&xas, 7, mark); offset = xas_find_chunk(xas, advance, mark); [offset is 6 which points to a valid entry] xa_store_range(xa, entry, 4, 7, gfp); entry = xa_entry(xa, node, 6); [entry is a sibling of 4] if (!xa_is_node(entry)) return entry; Skip sibling entry like xas_find() does to protect caller from seeing sibling entry from xas_find_marked() or caller may use sibling entry as a valid entry and crash the kernel. Besides, load_race() test is modified to catch mentioned issue and modified load_race() only passes after this fix is merged. Here is an example how this bug could be triggerred in theory in nfs which enables large folio in mapping: Let's take a look at involved racer: 1. How pages could be created and dirtied in nfs. write ksys_write vfs_write new_sync_write nfs_file_write generic_perform_write nfs_write_begin fgf_set_order __filemap_get_folio nfs_write_end nfs_update_folio nfs_writepage_setup nfs_mark_request_dirty filemap_dirty_folio __folio_mark_dirty __xa_set_mark 2. How dirty pages could be deleted in nfs. ioctl do_vfs_ioctl file_ioctl ioctl_preallocate vfs_fallocate nfs42_fallocate nfs42_proc_deallocate truncate_pagecache_range truncate_inode_pages_range truncate_inode_folio filemap_remove_folio page_cache_delete xas_store(&xas, NULL); 3. How dirty pages could be lockless searched sync_file_range ksys_sync_file_range __filemap_fdatawrite_range filemap_fdatawrite_wbc do_writepages writeback_use_writepage writeback_iter writeback_get_folio filemap_get_folios_tag find_get_entry folio = xas_find_marked() folio_try_get(folio) In theory, kernel will crash as following: 1.Create 2.Search 3.Delete /* write page 2,3 */ write ... nfs_write_begin fgf_set_order __filemap_get_folio ... /* index = 2, order = 1 */ xa_store(&xas, folio) nfs_write_end ... __folio_mark_dirty /* sync page 2 and page 3 */ sync_file_range ... find_get_entry folio = xas_find_marked() /* offset will be 2 */ offset = xas_find_chunk() /* delete page 2 and page 3 */ ioctl ... xas_store(&xas, NULL); /* write page 0-3 */ write ... nfs_write_begin fgf_set_order __filemap_get_folio ... /* index = 0, order = 2 */ xa_store(&xas, folio) nfs_write_end ... __folio_mark_dirty /* get sibling entry from offset 2 */ entry = xa_entry(.., 2) /* use sibling entry as folio and crash kernel */ folio_try_get(folio) Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com> --- lib/xarray.c | 2 ++ tools/testing/radix-tree/multiorder.c | 4 ++++ 2 files changed, 6 insertions(+)