[v3,14/39] dm vdo: implement the chapter volume store

Message ID	20230914191635.39805-15-snitzer@kernel.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <dm-devel-bounces@redhat.com> From: Mike Snitzer <snitzer@kernel.org> To: dm-devel@redhat.com Date: Thu, 14 Sep 2023 15:16:10 -0400 Message-Id: <20230914191635.39805-15-snitzer@kernel.org> In-Reply-To: <20230914191635.39805-1-snitzer@kernel.org> References: <20230914191635.39805-1-snitzer@kernel.org> MIME-Version: 1.0 Subject: [dm-devel] [PATCH v3 14/39] dm vdo: implement the chapter volume store Precedence: list Cc: Matthew Sakai <msakai@redhat.com> Errors-To: dm-devel-bounces@redhat.com Sender: "dm-devel" <dm-devel-bounces@redhat.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit
Series	dm: baseline for the VDO target \| expand [v3,00/39] dm: baseline for the VDO target [v3,01/39] dm: add documentation for dm-vdo target [v3,02/39] dm vdo: add the MurmurHash3 fast hashing algorithm [v3,03/39] dm vdo: add memory allocation utilities [v3,04/39] dm vdo: add basic logging and support utilities [v3,05/39] dm vdo: add type declarations, constants, and simple data structures [v3,06/39] dm vdo: add thread and synchronization utilities [v3,07/39] dm vdo: add specialized request queueing functionality [v3,08/39] dm vdo: add basic hash map data structures [v3,09/39] dm vdo: add deduplication configuration structures [v3,10/39] dm vdo: add deduplication index storage interface [v3,11/39] dm vdo: implement the delta index [v3,12/39] dm vdo: implement the volume index [v3,13/39] dm vdo: implement the open chapter and chapter indexes [v3,14/39] dm vdo: implement the chapter volume store [v3,15/39] dm vdo: implement top-level deduplication index [v3,16/39] dm vdo: implement external deduplication index interface [v3,17/39] dm vdo: add administrative state and action manager [v3,18/39] dm vdo: add vio, the request object for vdo metadata [v3,19/39] dm vdo: add data_vio, the request object which services incoming bios [v3,20/39] dm vdo: add flush support [v3,21/39] dm vdo: add the io_submitter [v3,22/39] dm vdo: add hash locks and hash zones [v3,23/39] dm vdo: add use of the deduplication index in hash zones [v3,24/39] dm vdo: add the compressed block bin packer [v3,25/39] dm vdo: add slab structure, slab journal and reference counters [v3,26/39] dm vdo: add the slab summary [v3,27/39] dm vdo: add the block allocators and physical zones [v3,28/39] dm vdo: add the slab depot [v3,29/39] dm vdo: add the block map [v3,30/39] dm vdo: implement the vdo block map page cache [v3,31/39] dm vdo: add the vdo recovery journal [v3,32/39] dm vdo: add repair of damanged vdo volumes [v3,33/39] dm vdo: add the vdo structure itself [v3,34/39] dm vdo: add the on-disk formats and marshalling of vdo structures [v3,35/39] dm vdo: add statistics reporting [v3,36/39] dm vdo: add sysfs support for setting vdo params and reading stats [v3,37/39] dm vdo: add debugging support [v3,38/39] dm vdo: add the top-level DM target [v3,39/39] dm vdo: enable configuration and building of dm-vdo

diff --git a/drivers/md/dm-vdo/index-page-map.c b/drivers/md/dm-vdo/index-page-map.c new file mode 100644 index 000000000000..592d5aff7c77 --- /dev/null +++ b/drivers/md/dm-vdo/index-page-map.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "index-page-map.h" + +#include "errors.h" +#include "hash-utils.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "string-utils.h" +#include "uds-threads.h" +#include "uds.h" + +/* + * The index page map is conceptually a two-dimensional array indexed by chapter number and index + * page number within the chapter. Each entry contains the number of the last delta list on that + * index page. In order to save memory, the information for the last page in each chapter is not + * recorded, as it is known from the geometry. + */ + +static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02"; + +enum { + PAGE_MAP_MAGIC_LENGTH = sizeof(PAGE_MAP_MAGIC) - 1, +}; + +static inline u32 get_entry_count(const struct geometry *geometry) +{ + return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1); +} + +int uds_make_index_page_map(const struct geometry *geometry, struct index_page_map **map_ptr) +{ + int result; + struct index_page_map *map; + + result = UDS_ALLOCATE(1, struct index_page_map, "page map", &map); + if (result != UDS_SUCCESS) + return result; + + map->geometry = geometry; + map->entries_per_chapter = geometry->index_pages_per_chapter - 1; + result = UDS_ALLOCATE(get_entry_count(geometry), + u16, + "Index Page Map Entries", + &map->entries); + if (result != UDS_SUCCESS) { + uds_free_index_page_map(map); + return result; + } + + *map_ptr = map; + return UDS_SUCCESS; +} + +void uds_free_index_page_map(struct index_page_map *map) +{ + if (map != NULL) { + UDS_FREE(map->entries); + UDS_FREE(map); + } +} + +void uds_update_index_page_map(struct index_page_map *map, + u64 virtual_chapter_number, + u32 chapter_number, + u32 index_page_number, + u32 delta_list_number) +{ + size_t slot; + + map->last_update = virtual_chapter_number; + if (index_page_number == map->entries_per_chapter) + return; + + slot = (chapter_number * map->entries_per_chapter) + index_page_number; + map->entries[slot] = delta_list_number; +} + +u32 uds_find_index_page_number(const struct index_page_map *map, + const struct uds_record_name *name, + u32 chapter_number) +{ + u32 delta_list_number = uds_hash_to_chapter_delta_list(name, map->geometry); + u32 slot = chapter_number * map->entries_per_chapter; + u32 page; + + for (page = 0; page < map->entries_per_chapter; page++) { + if (delta_list_number <= map->entries[slot + page]) + break; + } + + return page; +} + +void uds_get_list_number_bounds(const struct index_page_map *map, + u32 chapter_number, + u32 index_page_number, + u32 *lowest_list, + u32 *highest_list) +{ + u32 slot = chapter_number * map->entries_per_chapter; + + *lowest_list = ((index_page_number == 0) ? + 0 : + map->entries[slot + index_page_number - 1] + 1); + *highest_list = ((index_page_number < map->entries_per_chapter) ? + map->entries[slot + index_page_number] : + map->geometry->delta_lists_per_chapter - 1); +} + +u64 uds_compute_index_page_map_save_size(const struct geometry *geometry) +{ + return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry); +} + +int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer) +{ + int result; + u8 *buffer; + size_t offset = 0; + u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); + u32 i; + + result = UDS_ALLOCATE(saved_size, u8, "page map data", &buffer); + if (result != UDS_SUCCESS) + return result; + + memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH); + offset += PAGE_MAP_MAGIC_LENGTH; + encode_u64_le(buffer, &offset, map->last_update); + for (i = 0; i < get_entry_count(map->geometry); i++) + encode_u16_le(buffer, &offset, map->entries[i]); + + result = uds_write_to_buffered_writer(writer, buffer, offset); + UDS_FREE(buffer); + if (result != UDS_SUCCESS) + return result; + + return uds_flush_buffered_writer(writer); +} + +int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader) +{ + int result; + u8 magic[PAGE_MAP_MAGIC_LENGTH]; + u8 *buffer; + size_t offset = 0; + u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); + u32 i; + + result = UDS_ALLOCATE(saved_size, u8, "page map data", &buffer); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, buffer, saved_size); + if (result != UDS_SUCCESS) { + UDS_FREE(buffer); + return result; + } + + memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH); + offset += PAGE_MAP_MAGIC_LENGTH; + if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) { + UDS_FREE(buffer); + return UDS_CORRUPT_DATA; + } + + decode_u64_le(buffer, &offset, &map->last_update); + for (i = 0; i < get_entry_count(map->geometry); i++) + decode_u16_le(buffer, &offset, &map->entries[i]); + + UDS_FREE(buffer); + uds_log_debug("read index page map, last update %llu", + (unsigned long long) map->last_update); + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/index-page-map.h b/drivers/md/dm-vdo/index-page-map.h new file mode 100644 index 000000000000..14c84051af51 --- /dev/null +++ b/drivers/md/dm-vdo/index-page-map.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_PAGE_MAP_H +#define UDS_INDEX_PAGE_MAP_H + +#include "geometry.h" +#include "io-factory.h" + +/* + * The index maintains a page map which records how the chapter delta lists are distributed among + * the index pages for each chapter, allowing the volume to be efficient about reading only pages + * that it knows it will need. + */ + +struct index_page_map { + const struct geometry *geometry; + u64 last_update; + u32 entries_per_chapter; + u16 *entries; +}; + +int __must_check +uds_make_index_page_map(const struct geometry *geometry, struct index_page_map **map_ptr); + +void uds_free_index_page_map(struct index_page_map *map); + +int __must_check +uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader); + +int __must_check +uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer); + +void uds_update_index_page_map(struct index_page_map *map, + u64 virtual_chapter_number, + u32 chapter_number, + u32 index_page_number, + u32 delta_list_number); + +u32 __must_check uds_find_index_page_number(const struct index_page_map *map, + const struct uds_record_name *name, + u32 chapter_number); + +void uds_get_list_number_bounds(const struct index_page_map *map, + u32 chapter_number, + u32 index_page_number, + u32 *lowest_list, + u32 *highest_list); + +u64 uds_compute_index_page_map_save_size(const struct geometry *geometry); + +#endif /* UDS_INDEX_PAGE_MAP_H */ diff --git a/drivers/md/dm-vdo/radix-sort.c b/drivers/md/dm-vdo/radix-sort.c new file mode 100644 index 000000000000..7eeb81a6deb6 --- /dev/null +++ b/drivers/md/dm-vdo/radix-sort.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "radix-sort.h" + +#include <linux/limits.h> +#include <linux/types.h> + +#include "memory-alloc.h" +#include "string-utils.h" + +/* + * This implementation allocates one large object to do the sorting, which can be reused as many + * times as desired. The amount of memory required is logarithmically proportional to the number of + * keys to be sorted. + */ + +enum { + /* Piles smaller than this are handled with a simple insertion sort. */ + INSERTION_SORT_THRESHOLD = 12, +}; + +/* Sort keys are pointers to immutable fixed-length arrays of bytes. */ +typedef const u8 *sort_key_t; + +/* + * The keys are separated into piles based on the byte in each keys at the current offset, so the + * number of keys with each byte must be counted. + */ +struct histogram { + /* The number of non-empty bins */ + u16 used; + /* The index (key byte) of the first non-empty bin */ + u16 first; + /* The index (key byte) of the last non-empty bin */ + u16 last; + /* The number of occurrences of each specific byte */ + u32 size[256]; +}; + +/* + * Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound + * on the stack space needed. + */ +struct task { + /* Pointer to the first key to sort. */ + sort_key_t *first_key; + /* Pointer to the last key to sort. */ + sort_key_t *last_key; + /* The offset into the key at which to continue sorting. */ + u16 offset; + /* The number of bytes remaining in the sort keys. */ + u16 length; +}; + +struct radix_sorter { + unsigned int count; + struct histogram bins; + sort_key_t *pile[256]; + struct task *end_of_stack; + struct task insertion_list[256]; + struct task stack[]; +}; + +/* Compare a segment of two fixed-length keys starting at an offset. */ +static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length) +{ + return memcmp(&key1[offset], &key2[offset], length); +} + +/* Insert the next unsorted key into an array of sorted keys. */ +static inline void insert_key(const struct task task, sort_key_t *next) +{ + /* Pull the unsorted key out, freeing up the array slot. */ + sort_key_t unsorted = *next; + + /* Compare the key to the preceding sorted entries, shifting down ones that are larger. */ + while ((--next >= task.first_key) && + (compare(unsorted, next[0], task.offset, task.length) < 0)) + next[1] = next[0]; + + /* Insert the key into the last slot that was cleared, sorting it. */ + next[1] = unsorted; +} + +/* + * Sort a range of key segments using an insertion sort. This simple sort is faster than the + * 256-way radix sort when the number of keys to sort is small. + */ +static inline void insertion_sort(const struct task task) +{ + sort_key_t *next; + + for (next = task.first_key + 1; next <= task.last_key; next++) + insert_key(task, next); +} + +/* Push a sorting task onto a task stack. */ +static inline void push_task(struct task **stack_pointer, + sort_key_t *first_key, + u32 count, + u16 offset, + u16 length) +{ + struct task *task = (*stack_pointer)++; + + task->first_key = first_key; + task->last_key = &first_key[count - 1]; + task->offset = offset; + task->length = length; +} + +static inline void swap_keys(sort_key_t *a, sort_key_t *b) +{ + sort_key_t c = *a; + *a = *b; + *b = c; +} + +/* + * Count the number of times each byte value appears in the arrays of keys to sort at the current + * offset, keeping track of the number of non-empty bins, and the index of the first and last + * non-empty bin. + */ +static inline void measure_bins(const struct task task, struct histogram *bins) +{ + sort_key_t *key_ptr; + + /* + * Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears + * it all out as it goes. Even though this structure is re-used, we don't need to pay to + * zero it before starting a new tally. + */ + bins->first = U8_MAX; + bins->last = 0; + + for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) { + /* Increment the count for the byte in the key at the current offset. */ + u8 bin = (*key_ptr)[task.offset]; + u32 size = ++bins->size[bin]; + + /* Track non-empty bins. */ + if (size == 1) { + bins->used += 1; + if (bin < bins->first) + bins->first = bin; + + if (bin > bins->last) + bins->last = bin; + } + } +} + +/* + * Convert the bin sizes to pointers to where each pile goes. + * + * pile[0] = first_key + bin->size[0], + * pile[1] = pile[0] + bin->size[1], etc. + * + * After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the + * next radix position. A new task is put on the stack for each pile containing lots of keys, or a + * new task is put on the list for each pile containing few keys. + * + * @stack: pointer the top of the stack + * @end_of_stack: the end of the stack + * @list: pointer the head of the list + * @pile: array for pointers to the end of each pile + * @bins: the histogram of the sizes of each pile + * @first_key: the first key of the stack + * @offset: the next radix position to sort by + * @length: the number of bytes remaining in the sort keys + * + * Return: UDS_SUCCESS or an error code + */ +static inline int push_bins(struct task **stack, + struct task *end_of_stack, + struct task **list, + sort_key_t *pile[], + struct histogram *bins, + sort_key_t *first_key, + u16 offset, + u16 length) +{ + sort_key_t *pile_start = first_key; + int bin; + + for (bin = bins->first; ; bin++) { + u32 size = bins->size[bin]; + + /* Skip empty piles. */ + if (size == 0) + continue; + + /* There's no need to sort empty keys. */ + if (length > 0) { + if (size > INSERTION_SORT_THRESHOLD) { + if (*stack >= end_of_stack) + return UDS_BAD_STATE; + + push_task(stack, pile_start, size, offset, length); + } else if (size > 1) { + push_task(list, pile_start, size, offset, length); + } + } + + pile_start += size; + pile[bin] = pile_start; + if (--bins->used == 0) + break; + } + + return UDS_SUCCESS; +} + +int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) +{ + int result; + unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; + struct radix_sorter *radix_sorter; + + result = UDS_ALLOCATE_EXTENDED(struct radix_sorter, + stack_size, + struct task, + __func__, + &radix_sorter); + if (result != UDS_SUCCESS) + return result; + + radix_sorter->count = count; + radix_sorter->end_of_stack = radix_sorter->stack + stack_size; + *sorter = radix_sorter; + return UDS_SUCCESS; +} + +void uds_free_radix_sorter(struct radix_sorter *sorter) +{ + UDS_FREE(sorter); +} + +/* + * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort implementation + * is unstable, so the relative ordering of equal keys is not preserved. + */ +int uds_radix_sort(struct radix_sorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length) +{ + struct task start; + struct histogram *bins = &sorter->bins; + sort_key_t **pile = sorter->pile; + struct task *task_stack = sorter->stack; + + /* All zero-length keys are identical and therefore already sorted. */ + if ((count == 0) || (length == 0)) + return UDS_SUCCESS; + + /* The initial task is to sort the entire length of all the keys. */ + start = (struct task) { + .first_key = keys, + .last_key = &keys[count - 1], + .offset = 0, + .length = length, + }; + + if (count <= INSERTION_SORT_THRESHOLD) { + insertion_sort(start); + return UDS_SUCCESS; + } + + if (count > sorter->count) + return UDS_INVALID_ARGUMENT; + + /* + * Repeatedly consume a sorting task from the stack and process it, pushing new sub-tasks + * onto the stack for each radix-sorted pile. When all tasks and sub-tasks have been + * processed, the stack will be empty and all the keys in the starting task will be fully + * sorted. + */ + for (*task_stack = start; task_stack >= sorter->stack; task_stack--) { + const struct task task = *task_stack; + struct task *insertion_task_list; + int result; + sort_key_t *fence; + sort_key_t *end; + + measure_bins(task, bins); + + /* + * Now that we know how large each bin is, generate pointers for each of the piles + * and push a new task to sort each pile by the next radix byte. + */ + insertion_task_list = sorter->insertion_list; + result = push_bins(&task_stack, + sorter->end_of_stack, + &insertion_task_list, + pile, + bins, + task.first_key, + task.offset + 1, + task.length - 1); + if (result != UDS_SUCCESS) { + memset(bins, 0, sizeof(*bins)); + return result; + } + + /* Now bins->used is zero again. */ + + /* + * Don't bother processing the last pile: when piles 0..N-1 are all in place, then + * pile N must also be in place. + */ + end = task.last_key - bins->size[bins->last]; + bins->size[bins->last] = 0; + + for (fence = task.first_key; fence <= end; ) { + u8 bin; + sort_key_t key = *fence; + + /* + * The radix byte of the key tells us which pile it belongs in. Swap it for + * an unprocessed item just below that pile, and repeat. + */ + while (--pile[bin = key[task.offset]] > fence) + swap_keys(pile[bin], &key); + + /* + * The pile reached the fence. Put the key at the bottom of that pile, + * completing it, and advance the fence to the next pile. + */ + *fence = key; + fence += bins->size[bin]; + bins->size[bin] = 0; + } + + /* Now bins->size[] is all zero again. */ + + /* + * When the number of keys in a task gets small enough, it is faster to use an + * insertion sort than to keep subdividing into tiny piles. + */ + while (--insertion_task_list >= sorter->insertion_list) + insertion_sort(*insertion_task_list); + } + + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/radix-sort.h b/drivers/md/dm-vdo/radix-sort.h new file mode 100644 index 000000000000..f6f0b62be0ee --- /dev/null +++ b/drivers/md/dm-vdo/radix-sort.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_RADIX_SORT_H +#define UDS_RADIX_SORT_H + +/* + * Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix + * exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith + * Bostic, and M. Douglas McIlroy, "Engineering Radix Sort". + * + * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf + */ + +struct radix_sorter; + +int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter); + +void uds_free_radix_sorter(struct radix_sorter *sorter); + +int __must_check uds_radix_sort(struct radix_sorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length); + +#endif /* UDS_RADIX_SORT_H */ diff --git a/drivers/md/dm-vdo/volume.c b/drivers/md/dm-vdo/volume.c new file mode 100644 index 000000000000..00dc217392b8 --- /dev/null +++ b/drivers/md/dm-vdo/volume.c @@ -0,0 +1,1792 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "volume.h" + +#include <linux/atomic.h> +#include <linux/dm-bufio.h> +#include <linux/err.h> + +#include "chapter-index.h" +#include "config.h" +#include "errors.h" +#include "geometry.h" +#include "hash-utils.h" +#include "index.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "sparse-cache.h" +#include "string-utils.h" +#include "uds-threads.h" + +/* + * The first block of the volume layout is reserved for the volume header, which is no longer used. + * The remainder of the volume is divided into chapters consisting of several pages of records, and + * several pages of static index to use to find those records. The index pages are recorded first, + * followed by the record pages. The chapters are written in order as they are filled, so the + * volume storage acts as a circular log of the most recent chapters, with each new chapter + * overwriting the oldest saved one. + * + * When a new chapter is filled and closed, the records from that chapter are sorted and + * interleaved in approximate temporal order, and assigned to record pages. Then a static delta + * index is generated to store which record page contains each record. The in-memory index page map + * is also updated to indicate which delta lists fall on each chapter index page. This means that + * when a record is read, the volume only has to load a single index page and a single record page, + * rather than search the entire chapter. These index and record pages are written to storage, and + * the index pages are transferred to the page cache under the theory that the most recently + * written chapter is likely to be accessed again soon. + * + * When reading a record, the volume index will indicate which chapter should contain it. The + * volume uses the index page map to determine which chapter index page needs to be loaded, and + * then reads the relevant record page number from the chapter index. Both index and record pages + * are stored in a page cache when read for the common case that subsequent records need the same + * pages. The page cache evicts the least recently accessed entries when caching new pages. In + * addition, the volume uses dm-bufio to manage access to the storage, which may allow for + * additional caching depending on available system resources. + * + * Record requests are handled from cached pages when possible. If a page needs to be read, it is + * placed on a queue along with the request that wants to read it. Any requests for the same page + * that arrive while the read is pending are added to the queue entry. A separate reader thread + * handles the queued reads, adding the page to the cache and updating any requests queued with it + * so they can continue processing. This allows the index zone threads to continue processing new + * requests rather than wait for the storage reads. + * + * When an index rebuild is necessary, the volume reads each stored chapter to determine which + * range of chapters contain valid records, so that those records can be used to reconstruct the + * in-memory volume index. + */ + +enum { + /* The maximum allowable number of contiguous bad chapters */ + MAX_BAD_CHAPTERS = 100, + VOLUME_CACHE_MAX_ENTRIES = (U16_MAX >> 1), + VOLUME_CACHE_QUEUED_FLAG = (1 << 15), + VOLUME_CACHE_MAX_QUEUED_READS = 4096, +}; + +static const u64 BAD_CHAPTER = U64_MAX; + +/* + * The invalidate counter is two 32 bits fields stored together atomically. The low order 32 bits + * are the physical page number of the cached page being read. The high order 32 bits are a + * sequence number. This value is written when the zone that owns it begins or completes a cache + * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting + * to update the cache contents. + */ +union invalidate_counter { + u64 value; + struct { + u32 page; + u32 counter; + }; +}; + +static inline u32 map_to_page_number(struct geometry *geometry, u32 physical_page) +{ + return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter; +} + +static inline u32 map_to_chapter_number(struct geometry *geometry, u32 physical_page) +{ + return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter; +} + +static inline bool is_record_page(struct geometry *geometry, u32 physical_page) +{ + return map_to_page_number(geometry, physical_page) >= geometry->index_pages_per_chapter; +} + +static u32 map_to_physical_page(const struct geometry *geometry, u32 chapter, u32 page) +{ + /* Page zero is the header page, so the first chapter index page is page one. */ + return HEADER_PAGES_PER_VOLUME + (geometry->pages_per_chapter * chapter) + page; +} + +static inline union invalidate_counter +get_invalidate_counter(struct page_cache *cache, unsigned int zone_number) +{ + return (union invalidate_counter) { + .value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value), + }; +} + +static inline void set_invalidate_counter(struct page_cache *cache, + unsigned int zone_number, + union invalidate_counter invalidate_counter) +{ + WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value, + invalidate_counter.value); +} + +static inline bool search_pending(union invalidate_counter invalidate_counter) +{ + return (invalidate_counter.counter & 1) != 0; +} + +/* Lock the cache for a zone in order to search for a page. */ +static void +begin_pending_search(struct page_cache *cache, u32 physical_page, unsigned int zone_number) +{ + union invalidate_counter invalidate_counter = get_invalidate_counter(cache, zone_number); + + invalidate_counter.page = physical_page; + invalidate_counter.counter++; + set_invalidate_counter(cache, zone_number, invalidate_counter); + ASSERT_LOG_ONLY(search_pending(invalidate_counter), + "Search is pending for zone %u", + zone_number); + /* + * This memory barrier ensures that the write to the invalidate counter is seen by other + * threads before this thread accesses the cached page. The corresponding read memory + * barrier is in wait_for_pending_searches(). + */ + smp_mb(); +} + +/* Unlock the cache for a zone by clearing its invalidate counter. */ +static void end_pending_search(struct page_cache *cache, unsigned int zone_number) +{ + union invalidate_counter invalidate_counter; + + /* + * This memory barrier ensures that this thread completes reads of the + * cached page before other threads see the write to the invalidate + * counter. + */ + smp_mb(); + + invalidate_counter = get_invalidate_counter(cache, zone_number); + ASSERT_LOG_ONLY(search_pending(invalidate_counter), + "Search is pending for zone %u", + zone_number); + invalidate_counter.counter++; + set_invalidate_counter(cache, zone_number, invalidate_counter); +} + +static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page) +{ + union invalidate_counter initial_counters[MAX_ZONES]; + unsigned int i; + + /* + * We hold the read_threads_mutex. We are waiting for threads that do not hold the + * read_threads_mutex. Those threads have "locked" their targeted page by setting the + * search_pending_counter. The corresponding write memory barrier is in + * begin_pending_search(). + */ + smp_mb(); + + for (i = 0; i < cache->zone_count; i++) + initial_counters[i] = get_invalidate_counter(cache, i); + for (i = 0; i < cache->zone_count; i++) + if (search_pending(initial_counters[i]) && + (initial_counters[i].page == physical_page)) + /* + * There is an active search using the physical page. We need to wait for + * the search to finish. + * + * FIXME: Investigate using wait_event() to wait for the search to finish. + */ + while (initial_counters[i].value == get_invalidate_counter(cache, i).value) + cond_resched(); +} + +static void release_page_buffer(struct cached_page *page) +{ + if (page->buffer != NULL) + dm_bufio_release(UDS_FORGET(page->buffer)); +} + +static void clear_cache_page(struct page_cache *cache, struct cached_page *page) +{ + /* Do not clear read_pending because the read queue relies on it. */ + release_page_buffer(page); + page->physical_page = cache->indexable_pages; + WRITE_ONCE(page->last_used, 0); +} + +static void make_page_most_recent(struct page_cache *cache, struct cached_page *page) +{ + /* + * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any + * thread holding the read_threads_mutex. + */ + if (atomic64_read(&cache->clock) != READ_ONCE(page->last_used)) + WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock)); +} + +/* Select a page to remove from the cache to make space for a new entry. */ +static struct cached_page *select_victim_in_cache(struct page_cache *cache) +{ + struct cached_page *page; + int oldest_index = 0; + s64 oldest_time = S64_MAX; + s64 last_used; + u16 i; + + /* Find the oldest unclaimed page. We hold the read_threads_mutex. */ + for (i = 0; i < cache->cache_slots; i++) { + /* A page with a pending read must not be replaced. */ + if (cache->cache[i].read_pending) + continue; + + last_used = READ_ONCE(cache->cache[i].last_used); + if (last_used <= oldest_time) { + oldest_time = last_used; + oldest_index = i; + } + } + + page = &cache->cache[oldest_index]; + if (page->physical_page != cache->indexable_pages) { + WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots); + wait_for_pending_searches(cache, page->physical_page); + } + + page->read_pending = true; + clear_cache_page(cache, page); + return page; +} + +/* Make a newly filled cache entry available to other threads. */ +static int +put_page_in_cache(struct page_cache *cache, u32 physical_page, struct cached_page *page) +{ + int result; + + /* We hold the read_threads_mutex. */ + result = ASSERT((page->read_pending), "page to install has a pending read"); + if (result != UDS_SUCCESS) + return result; + + page->physical_page = physical_page; + make_page_most_recent(cache, page); + page->read_pending = false; + + /* + * We hold the read_threads_mutex, but we must have a write memory barrier before making + * the cached_page available to the readers that do not hold the mutex. The corresponding + * read memory barrier is in get_page_and_index(). + */ + smp_wmb(); + + /* This assignment also clears the queued flag. */ + WRITE_ONCE(cache->index[physical_page], page - cache->cache); + return UDS_SUCCESS; +} + +static void cancel_page_in_cache(struct page_cache *cache, + u32 physical_page, + struct cached_page *page) +{ + int result; + + /* We hold the read_threads_mutex. */ + result = ASSERT((page->read_pending), "page to install has a pending read"); + if (result != UDS_SUCCESS) + return; + + clear_cache_page(cache, page); + page->read_pending = false; + + /* Clear the mapping and the queued flag for the new page. */ + WRITE_ONCE(cache->index[physical_page], cache->cache_slots); +} + +static inline u16 next_queue_position(u16 position) +{ + return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS; +} + +static inline void advance_queue_position(u16 *position) +{ + *position = next_queue_position(*position); +} + +static inline bool read_queue_is_full(struct page_cache *cache) +{ + return cache->read_queue_first == next_queue_position(cache->read_queue_last); +} + +static bool +enqueue_read(struct page_cache *cache, struct uds_request *request, u32 physical_page) +{ + struct queued_read *queue_entry; + u16 last = cache->read_queue_last; + u16 read_queue_index; + + /* We hold the read_threads_mutex. */ + if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) { + /* This page has no existing entry in the queue. */ + if (read_queue_is_full(cache)) + return false; + + /* Fill in the read queue entry. */ + cache->read_queue[last].physical_page = physical_page; + cache->read_queue[last].invalid = false; + cache->read_queue[last].first_request = NULL; + cache->read_queue[last].last_request = NULL; + + /* Point the cache index to the read queue entry. */ + read_queue_index = last; + WRITE_ONCE(cache->index[physical_page], + read_queue_index | VOLUME_CACHE_QUEUED_FLAG); + + advance_queue_position(&cache->read_queue_last); + } else { + /* It's already queued, so add this request to the existing entry. */ + read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG; + } + + request->next_request = NULL; + queue_entry = &cache->read_queue[read_queue_index]; + if (queue_entry->first_request == NULL) + queue_entry->first_request = request; + else + queue_entry->last_request->next_request = request; + queue_entry->last_request = request; + + return true; +} + +static void +enqueue_page_read(struct volume *volume, struct uds_request *request, u32 physical_page) +{ + /* Mark the page as queued, so that chapter invalidation knows to cancel a read. */ + while (!enqueue_read(&volume->page_cache, request, physical_page)) { + uds_log_debug("Read queue full, waiting for reads to finish"); + uds_wait_cond(&volume->read_threads_read_done_cond, &volume->read_threads_mutex); + } + + uds_signal_cond(&volume->read_threads_cond); +} + +/* + * Reserve the next read queue entry for processing, but do not actually remove it from the queue. + * Must be followed by release_queued_requests(). + */ +static struct queued_read *reserve_read_queue_entry(struct page_cache *cache) +{ + /* We hold the read_threads_mutex. */ + struct queued_read *entry; + u16 index_value; + bool queued; + + /* No items to dequeue */ + if (cache->read_queue_next_read == cache->read_queue_last) + return NULL; + + entry = &cache->read_queue[cache->read_queue_next_read]; + index_value = cache->index[entry->physical_page]; + queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0; + /* Check to see if it's still queued before resetting. */ + if (entry->invalid && queued) + WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots); + + /* + * If a synchronous read has taken this page, set invalid to true so it doesn't get + * overwritten. Requests will just be requeued. + */ + if (!queued) + entry->invalid = true; + + entry->reserved = true; + advance_queue_position(&cache->read_queue_next_read); + return entry; +} + +static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume) +{ + struct queued_read *queue_entry = NULL; + + while (!volume->read_threads_exiting) { + queue_entry = reserve_read_queue_entry(&volume->page_cache); + if (queue_entry != NULL) + break; + + uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex); + } + + return queue_entry; +} + +static int init_chapter_index_page(const struct volume *volume, + u8 *index_page, + u32 chapter, + u32 index_page_number, + struct delta_index_page *chapter_index_page) +{ + u64 ci_virtual; + u32 ci_chapter; + u32 lowest_list; + u32 highest_list; + struct geometry *geometry = volume->geometry; + int result; + + result = uds_initialize_chapter_index_page(chapter_index_page, + geometry, + index_page, + volume->nonce); + if (volume->lookup_mode == LOOKUP_FOR_REBUILD) + return result; + + if (result != UDS_SUCCESS) + return uds_log_error_strerror(result, + "Reading chapter index page for chapter %u page %u", + chapter, + index_page_number); + + uds_get_list_number_bounds(volume->index_page_map, + chapter, + index_page_number, + &lowest_list, + &highest_list); + ci_virtual = chapter_index_page->virtual_chapter_number; + ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual); + if ((chapter == ci_chapter) && + (lowest_list == chapter_index_page->lowest_list_number) && + (highest_list == chapter_index_page->highest_list_number)) + return UDS_SUCCESS; + + uds_log_warning("Index page map updated to %llu", + (unsigned long long) volume->index_page_map->last_update); + uds_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u", + chapter, + index_page_number, + lowest_list, + highest_list, + (unsigned long long) ci_virtual, + chapter_index_page->lowest_list_number, + chapter_index_page->highest_list_number); + return uds_log_error_strerror(UDS_CORRUPT_DATA, + "index page map mismatch with chapter index"); +} + +static int initialize_index_page(const struct volume *volume, + u32 physical_page, + struct cached_page *page) +{ + u32 chapter = map_to_chapter_number(volume->geometry, physical_page); + u32 index_page_number = map_to_page_number(volume->geometry, physical_page); + + return init_chapter_index_page(volume, + dm_bufio_get_block_data(page->buffer), + chapter, + index_page_number, + &page->index_page); +} + +static bool +search_record_page(const u8 record_page[], + const struct uds_record_name *name, + const struct geometry *geometry, + struct uds_record_data *metadata) +{ + /* + * The array of records is sorted by name and stored as a binary tree in heap order, so the + * root of the tree is the first array element. + */ + u32 node = 0; + const struct uds_volume_record *records = (const struct uds_volume_record *) record_page; + + while (node < geometry->records_per_page) { + int result; + const struct uds_volume_record *record = &records[node]; + + result = memcmp(name, &record->name, UDS_RECORD_NAME_SIZE); + if (result == 0) { + if (metadata != NULL) + *metadata = record->data; + return true; + } + + /* The children of node N are at indexes 2N+1 and 2N+2. */ + node = ((2 * node) + ((result < 0) ? 1 : 2)); + } + + return false; +} + +/* + * If we've read in a record page, we're going to do an immediate search, to speed up processing by + * avoiding get_record_from_zone(), and to ensure that requests make progress even when queued. If + * we've read in an index page, we save the record page number so we don't have to resolve the + * index page again. We use the location, virtual_chapter, and old_metadata fields in the request + * to allow the index code to know where to begin processing the request again. + */ +static int search_page(struct cached_page *page, + const struct volume *volume, + struct uds_request *request, + u32 physical_page) +{ + int result; + enum uds_index_region location; + u16 record_page_number; + + if (is_record_page(volume->geometry, physical_page)) { + if (search_record_page(dm_bufio_get_block_data(page->buffer), + &request->record_name, + volume->geometry, + &request->old_metadata)) + location = UDS_LOCATION_RECORD_PAGE_LOOKUP; + else + location = UDS_LOCATION_UNAVAILABLE; + } else { + result = uds_search_chapter_index_page(&page->index_page, + volume->geometry, + &request->record_name, + &record_page_number); + if (result != UDS_SUCCESS) + return result; + + if (record_page_number == NO_CHAPTER_INDEX_ENTRY) { + location = UDS_LOCATION_UNAVAILABLE; + } else { + location = UDS_LOCATION_INDEX_PAGE_LOOKUP; + *((u16 *) &request->old_metadata) = record_page_number; + } + } + + request->location = location; + request->found = false; + return UDS_SUCCESS; +} + +static int process_entry(struct volume *volume, struct queued_read *entry) +{ + u32 page_number = entry->physical_page; + struct uds_request *request; + struct cached_page *page = NULL; + u8 *page_data; + int result; + + if (entry->invalid) { + uds_log_debug("Requeuing requests for invalid page"); + return UDS_SUCCESS; + } + + page = select_victim_in_cache(&volume->page_cache); + + uds_unlock_mutex(&volume->read_threads_mutex); + page_data = dm_bufio_read(volume->client, page_number, &page->buffer); + if (IS_ERR(page_data)) { + result = -PTR_ERR(page_data); + uds_log_warning_strerror(result, + "error reading physical page %u from volume", + page_number); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return result; + } + uds_lock_mutex(&volume->read_threads_mutex); + + if (entry->invalid) { + uds_log_warning("Page %u invalidated after read", page_number); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return UDS_SUCCESS; + } + + if (!is_record_page(volume->geometry, page_number)) { + result = initialize_index_page(volume, page_number, page); + if (result != UDS_SUCCESS) { + uds_log_warning("Error initializing chapter index page"); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return result; + } + } + + result = put_page_in_cache(&volume->page_cache, page_number, page); + if (result != UDS_SUCCESS) { + uds_log_warning("Error putting page %u in cache", page_number); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return result; + } + + request = entry->first_request; + while ((request != NULL) && (result == UDS_SUCCESS)) { + result = search_page(page, volume, request, page_number); + request = request->next_request; + } + + return result; +} + +static void release_queued_requests(struct volume *volume, struct queued_read *entry, int result) +{ + struct page_cache *cache = &volume->page_cache; + u16 next_read = cache->read_queue_next_read; + struct uds_request *request; + struct uds_request *next; + + for (request = entry->first_request; request != NULL; request = next) { + next = request->next_request; + request->status = result; + request->requeued = true; + uds_enqueue_request(request, STAGE_INDEX); + } + + entry->reserved = false; + + /* Move the read_queue_first pointer as far as we can. */ + while ((cache->read_queue_first != next_read) && + (!cache->read_queue[cache->read_queue_first].reserved)) + advance_queue_position(&cache->read_queue_first); + uds_broadcast_cond(&volume->read_threads_read_done_cond); +} + +static void read_thread_function(void *arg) +{ + struct volume *volume = arg; + + uds_log_debug("reader starting"); + uds_lock_mutex(&volume->read_threads_mutex); + while (true) { + struct queued_read *queue_entry; + int result; + + queue_entry = wait_to_reserve_read_queue_entry(volume); + if (volume->read_threads_exiting) + break; + + result = process_entry(volume, queue_entry); + release_queued_requests(volume, queue_entry, result); + } + uds_unlock_mutex(&volume->read_threads_mutex); + uds_log_debug("reader done"); +} + +static void get_page_and_index(struct page_cache *cache, + u32 physical_page, + int *queue_index, + struct cached_page **page_ptr) +{ + u16 index_value; + u16 index; + bool queued; + + /* + * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any + * thread holding the read_threads_mutex. + * + * Holding only a search_pending_counter is the most frequent case. + */ + /* + * It would be unlikely for the compiler to turn the usage of index_value into two reads of + * cache->index, but it would be possible and very bad if those reads did not return the + * same bits. + */ + index_value = READ_ONCE(cache->index[physical_page]); + queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0; + index = index_value & ~VOLUME_CACHE_QUEUED_FLAG; + + if (!queued && (index < cache->cache_slots)) { + *page_ptr = &cache->cache[index]; + /* + * We have acquired access to the cached page, but unless we hold the + * read_threads_mutex, we need a read memory barrier now. The corresponding write + * memory barrier is in put_page_in_cache(). + */ + smp_rmb(); + } else { + *page_ptr = NULL; + } + + *queue_index = queued ? index : -1; +} + +static void +get_page_from_cache(struct page_cache *cache, u32 physical_page, struct cached_page **page) +{ + /* + * ASSERTION: We are in a zone thread. + * ASSERTION: We holding a search_pending_counter or the read_threads_mutex. + */ + int queue_index = -1; + + get_page_and_index(cache, physical_page, &queue_index, page); +} + +static int read_page_locked(struct volume *volume, + u32 physical_page, + struct cached_page **page_ptr) +{ + int result = UDS_SUCCESS; + struct cached_page *page = NULL; + u8 *page_data; + + page = select_victim_in_cache(&volume->page_cache); + page_data = dm_bufio_read(volume->client, physical_page, &page->buffer); + if (IS_ERR(page_data)) { + result = -PTR_ERR(page_data); + uds_log_warning_strerror(result, + "error reading physical page %u from volume", + physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + if (!is_record_page(volume->geometry, physical_page)) { + result = initialize_index_page(volume, physical_page, page); + if (result != UDS_SUCCESS) { + if (volume->lookup_mode != LOOKUP_FOR_REBUILD) + uds_log_warning("Corrupt index page %u", physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + } + + result = put_page_in_cache(&volume->page_cache, physical_page, page); + if (result != UDS_SUCCESS) { + uds_log_warning("Error putting page %u in cache", physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + *page_ptr = page; + return UDS_SUCCESS; +} + +/* Retrieve a page from the cache while holding the read threads mutex. */ +static int +get_volume_page_locked(struct volume *volume, u32 physical_page, struct cached_page **page_ptr) +{ + int result; + struct cached_page *page = NULL; + + get_page_from_cache(&volume->page_cache, physical_page, &page); + if (page == NULL) { + result = read_page_locked(volume, physical_page, &page); + if (result != UDS_SUCCESS) + return result; + } else { + make_page_most_recent(&volume->page_cache, page); + } + + *page_ptr = page; + return UDS_SUCCESS; +} + +/* Retrieve a page from the cache while holding a search_pending lock. */ +static int +get_volume_page_protected(struct volume *volume, + struct uds_request *request, + u32 physical_page, + struct cached_page **page_ptr) +{ + struct cached_page *page; + + get_page_from_cache(&volume->page_cache, physical_page, &page); + if (page != NULL) { + if (request->zone_number == 0) + /* Only one zone is allowed to update the LRU. */ + make_page_most_recent(&volume->page_cache, page); + + *page_ptr = page; + return UDS_SUCCESS; + } + + /* Prepare to enqueue a read for the page. */ + end_pending_search(&volume->page_cache, request->zone_number); + uds_lock_mutex(&volume->read_threads_mutex); + + /* + * Do the lookup again while holding the read mutex (no longer the fast case so this should + * be fine to repeat). We need to do this because a page may have been added to the cache + * by a reader thread between the time we searched above and the time we went to actually + * try to enqueue it below. This could result in us enqueuing another read for a page which + * is already in the cache, which would mean we end up with two entries in the cache for + * the same page. + */ + get_page_from_cache(&volume->page_cache, physical_page, &page); + if (page == NULL) { + enqueue_page_read(volume, request, physical_page); + /* + * The performance gain from unlocking first, while "search pending" mode is off, + * turns out to be significant in some cases. The page is not available yet so + * the order does not matter for correctness as it does below. + */ + uds_unlock_mutex(&volume->read_threads_mutex); + begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + return UDS_QUEUED; + } + + /* + * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and + * "search pending" state in careful order so no other thread can mess with the data before + * the caller gets to look at it. + */ + begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + uds_unlock_mutex(&volume->read_threads_mutex); + *page_ptr = page; + return UDS_SUCCESS; +} + +static int get_volume_page(struct volume *volume, + u32 chapter, + u32 page_number, + struct cached_page **page_ptr) +{ + int result; + u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number); + + uds_lock_mutex(&volume->read_threads_mutex); + result = get_volume_page_locked(volume, physical_page, page_ptr); + uds_unlock_mutex(&volume->read_threads_mutex); + return result; +} + +int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number, u8 **data_ptr) +{ + int result; + struct cached_page *page = NULL; + + result = get_volume_page(volume, chapter, page_number, &page); + if (result == UDS_SUCCESS) + *data_ptr = dm_bufio_get_block_data(page->buffer); + return result; +} + +int uds_get_volume_index_page(struct volume *volume, + u32 chapter, + u32 page_number, + struct delta_index_page **index_page_ptr) +{ + int result; + struct cached_page *page = NULL; + + result = get_volume_page(volume, chapter, page_number, &page); + if (result == UDS_SUCCESS) + *index_page_ptr = &page->index_page; + return result; +} + +/* + * Find the record page associated with a name in a given index page. This will return UDS_QUEUED + * if the page in question must be read from storage. + */ +static int search_cached_index_page(struct volume *volume, + struct uds_request *request, + u32 chapter, + u32 index_page_number, + u16 *record_page_number) +{ + int result; + struct cached_page *page = NULL; + u32 physical_page = map_to_physical_page(volume->geometry, chapter, index_page_number); + + /* + * Make sure the invalidate counter is updated before we try and read the mapping. This + * prevents this thread from reading a page in the cache which has already been marked for + * invalidation by the reader thread, before the reader thread has noticed that the + * invalidate_counter has been incremented. + */ + begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + + result = get_volume_page_protected(volume, request, physical_page, &page); + if (result != UDS_SUCCESS) { + end_pending_search(&volume->page_cache, request->zone_number); + return result; + } + + result = uds_search_chapter_index_page(&page->index_page, + volume->geometry, + &request->record_name, + record_page_number); + end_pending_search(&volume->page_cache, request->zone_number); + return result; +} + +/* + * Find the metadata associated with a name in a given record page. This will return UDS_QUEUED if + * the page in question must be read from storage. + */ +int uds_search_cached_record_page(struct volume *volume, + struct uds_request *request, + u32 chapter, + u16 record_page_number, + bool *found) +{ + struct cached_page *record_page; + struct geometry *geometry = volume->geometry; + int result; + u32 physical_page, page_number; + + *found = false; + if (record_page_number == NO_CHAPTER_INDEX_ENTRY) + return UDS_SUCCESS; + + result = ASSERT(record_page_number < geometry->record_pages_per_chapter, + "0 <= %d < %u", + record_page_number, + geometry->record_pages_per_chapter); + if (result != UDS_SUCCESS) + return result; + + page_number = geometry->index_pages_per_chapter + record_page_number; + + physical_page = map_to_physical_page(volume->geometry, chapter, page_number); + + /* + * Make sure the invalidate counter is updated before we try and read the mapping. This + * prevents this thread from reading a page in the cache which has already been marked for + * invalidation by the reader thread, before the reader thread has noticed that the + * invalidate_counter has been incremented. + */ + begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + + result = get_volume_page_protected(volume, request, physical_page, &record_page); + if (result != UDS_SUCCESS) { + end_pending_search(&volume->page_cache, request->zone_number); + return result; + } + + if (search_record_page(dm_bufio_get_block_data(record_page->buffer), + &request->record_name, + geometry, + &request->old_metadata)) + *found = true; + + end_pending_search(&volume->page_cache, request->zone_number); + return UDS_SUCCESS; +} + +void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter) +{ + const struct geometry *geometry = volume->geometry; + u32 physical_page = map_to_physical_page(geometry, chapter, 0); + + dm_bufio_prefetch(volume->client, physical_page, geometry->pages_per_chapter); +} + +int uds_read_chapter_index_from_volume(const struct volume *volume, + u64 virtual_chapter, + struct dm_buffer *volume_buffers[], + struct delta_index_page index_pages[]) +{ + int result; + u32 i; + const struct geometry *geometry = volume->geometry; + u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter); + u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0); + + dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter); + for (i = 0; i < geometry->index_pages_per_chapter; i++) { + u8 *index_page; + + index_page = dm_bufio_read(volume->client, physical_page + i, &volume_buffers[i]); + if (IS_ERR(index_page)) { + result = -PTR_ERR(index_page); + uds_log_warning_strerror(result, + "error reading physical page %u", + physical_page); + return result; + } + + result = init_chapter_index_page(volume, + index_page, + physical_chapter, + i, + &index_pages[i]); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request, bool *found) +{ + int result; + u32 physical_chapter = + uds_map_to_physical_chapter(volume->geometry, request->virtual_chapter); + u32 index_page_number; + u16 record_page_number; + + index_page_number = uds_find_index_page_number(volume->index_page_map, + &request->record_name, + physical_chapter); + + if (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP) { + record_page_number = *((u16 *) &request->old_metadata); + } else { + result = search_cached_index_page(volume, + request, + physical_chapter, + index_page_number, + &record_page_number); + if (result != UDS_SUCCESS) + return result; + } + + return uds_search_cached_record_page(volume, + request, + physical_chapter, + record_page_number, + found); +} + +int uds_search_volume_page_cache_for_rebuild(struct volume *volume, + const struct uds_record_name *name, + u64 virtual_chapter, + bool *found) +{ + int result; + struct geometry *geometry = volume->geometry; + struct cached_page *page; + u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter); + u32 index_page_number; + u16 record_page_number; + u32 page_number; + + *found = false; + index_page_number = + uds_find_index_page_number(volume->index_page_map, name, physical_chapter); + result = get_volume_page(volume, physical_chapter, index_page_number, &page); + if (result != UDS_SUCCESS) + return result; + + result = uds_search_chapter_index_page(&page->index_page, + geometry, + name, + &record_page_number); + if (result != UDS_SUCCESS) + return result; + + if (record_page_number == NO_CHAPTER_INDEX_ENTRY) + return UDS_SUCCESS; + + page_number = geometry->index_pages_per_chapter + record_page_number; + result = get_volume_page(volume, physical_chapter, page_number, &page); + if (result != UDS_SUCCESS) + return result; + + *found = search_record_page(dm_bufio_get_block_data(page->buffer), name, geometry, NULL); + return UDS_SUCCESS; +} + +static void invalidate_page(struct page_cache *cache, u32 physical_page) +{ + struct cached_page *page; + int queue_index = -1; + + /* We hold the read_threads_mutex. */ + get_page_and_index(cache, physical_page, &queue_index, &page); + if (page != NULL) { + WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots); + wait_for_pending_searches(cache, page->physical_page); + clear_cache_page(cache, page); + } else if (queue_index > -1) { + uds_log_debug("setting pending read to invalid"); + cache->read_queue[queue_index].invalid = true; + } +} + +void uds_forget_chapter(struct volume *volume, u64 virtual_chapter) +{ + u32 physical_chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter); + u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0); + u32 i; + + uds_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter); + uds_lock_mutex(&volume->read_threads_mutex); + for (i = 0; i < volume->geometry->pages_per_chapter; i++) + invalidate_page(&volume->page_cache, first_page + i); + uds_unlock_mutex(&volume->read_threads_mutex); +} + +/* + * Donate an index pages from a newly written chapter to the page cache since it is likely to be + * used again soon. The caller must already hold the reader thread mutex. + */ +static int donate_index_page_locked(struct volume *volume, + u32 physical_chapter, + u32 index_page_number, + struct dm_buffer *page_buffer) +{ + int result; + struct cached_page *page = NULL; + u32 physical_page = + map_to_physical_page(volume->geometry, physical_chapter, index_page_number); + + page = select_victim_in_cache(&volume->page_cache); + page->buffer = page_buffer; + result = init_chapter_index_page(volume, + dm_bufio_get_block_data(page_buffer), + physical_chapter, + index_page_number, + &page->index_page); + if (result != UDS_SUCCESS) { + uds_log_warning("Error initialize chapter index page"); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + result = put_page_in_cache(&volume->page_cache, physical_page, page); + if (result != UDS_SUCCESS) { + uds_log_warning("Error putting page %u in cache", physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + return UDS_SUCCESS; +} + +static int write_index_pages(struct volume *volume, + u32 physical_chapter_number, + struct open_chapter_index *chapter_index) +{ + struct geometry *geometry = volume->geometry; + struct dm_buffer *page_buffer; + u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0); + u32 delta_list_number = 0; + u32 index_page_number; + + for (index_page_number = 0; + index_page_number < geometry->index_pages_per_chapter; + index_page_number++) { + u8 *page_data; + u32 physical_page = first_index_page + index_page_number; + u32 lists_packed; + bool last_page; + int result; + + page_data = dm_bufio_new(volume->client, physical_page, &page_buffer); + if (IS_ERR(page_data)) + return uds_log_warning_strerror(-PTR_ERR(page_data), + "failed to prepare index page"); + + last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter); + result = uds_pack_open_chapter_index_page(chapter_index, + page_data, + delta_list_number, + last_page, + &lists_packed); + if (result != UDS_SUCCESS) { + dm_bufio_release(page_buffer); + return uds_log_warning_strerror(result, "failed to pack index page"); + } + + dm_bufio_mark_buffer_dirty(page_buffer); + + if (lists_packed == 0) + uds_log_debug("no delta lists packed on chapter %u page %u", + physical_chapter_number, + index_page_number); + else + delta_list_number += lists_packed; + + uds_update_index_page_map(volume->index_page_map, + chapter_index->virtual_chapter_number, + physical_chapter_number, + index_page_number, + delta_list_number - 1); + + uds_lock_mutex(&volume->read_threads_mutex); + result = donate_index_page_locked(volume, + physical_chapter_number, + index_page_number, + page_buffer); + uds_unlock_mutex(&volume->read_threads_mutex); + if (result != UDS_SUCCESS) { + dm_bufio_release(page_buffer); + return result; + } + } + + return UDS_SUCCESS; +} + +static u32 encode_tree(u8 record_page[], + const struct uds_volume_record *sorted_pointers[], + u32 next_record, + u32 node, + u32 node_count) +{ + if (node < node_count) { + u32 child = (2 * node) + 1; + + next_record = encode_tree(record_page, + sorted_pointers, + next_record, + child, + node_count); + + /* + * In-order traversal: copy the contents of the next record into the page at the + * node offset. + */ + memcpy(&record_page[node * BYTES_PER_RECORD], + sorted_pointers[next_record++], + BYTES_PER_RECORD); + + next_record = encode_tree(record_page, + sorted_pointers, + next_record, + child + 1, + node_count); + } + + return next_record; +} + +static int +encode_record_page(const struct volume *volume, + const struct uds_volume_record records[], + u8 record_page[]) +{ + int result; + u32 i; + u32 records_per_page = volume->geometry->records_per_page; + const struct uds_volume_record **record_pointers = volume->record_pointers; + + for (i = 0; i < records_per_page; i++) + record_pointers[i] = &records[i]; + + /* + * Sort the record pointers by using just the names in the records, which is less work than + * sorting the entire record values. + */ + STATIC_ASSERT(offsetof(struct uds_volume_record, name) == 0); + result = uds_radix_sort(volume->radix_sorter, + (const u8 **) record_pointers, + records_per_page, + UDS_RECORD_NAME_SIZE); + if (result != UDS_SUCCESS) + return result; + + encode_tree(record_page, record_pointers, 0, 0, records_per_page); + return UDS_SUCCESS; +} + +static int write_record_pages(struct volume *volume, + u32 physical_chapter_number, + const struct uds_volume_record *records) +{ + u32 record_page_number; + struct geometry *geometry = volume->geometry; + struct dm_buffer *page_buffer; + const struct uds_volume_record *next_record = records; + u32 first_record_page = map_to_physical_page(geometry, + physical_chapter_number, + geometry->index_pages_per_chapter); + + for (record_page_number = 0; + record_page_number < geometry->record_pages_per_chapter; + record_page_number++) { + u8 *page_data; + u32 physical_page = first_record_page + record_page_number; + int result; + + page_data = dm_bufio_new(volume->client, physical_page, &page_buffer); + if (IS_ERR(page_data)) + return uds_log_warning_strerror(-PTR_ERR(page_data), + "failed to prepare record page"); + + result = encode_record_page(volume, next_record, page_data); + if (result != UDS_SUCCESS) { + dm_bufio_release(page_buffer); + return uds_log_warning_strerror(result, + "failed to encode record page %u", + record_page_number); + } + + next_record += geometry->records_per_page; + dm_bufio_mark_buffer_dirty(page_buffer); + dm_bufio_release(page_buffer); + } + + return UDS_SUCCESS; +} + +int uds_write_chapter(struct volume *volume, + struct open_chapter_index *chapter_index, + const struct uds_volume_record *records) +{ + int result; + u32 physical_chapter_number = + uds_map_to_physical_chapter(volume->geometry, + chapter_index->virtual_chapter_number); + + result = write_index_pages(volume, physical_chapter_number, chapter_index); + if (result != UDS_SUCCESS) + return result; + + result = write_record_pages(volume, physical_chapter_number, records); + if (result != UDS_SUCCESS) + return result; + + result = -dm_bufio_write_dirty_buffers(volume->client); + if (result != UDS_SUCCESS) + uds_log_error_strerror(result, "cannot sync chapter to volume"); + + return result; +} + +static void probe_chapter(struct volume *volume, u32 chapter_number, u64 *virtual_chapter_number) +{ + const struct geometry *geometry = volume->geometry; + u32 expected_list_number = 0; + u32 i; + u64 vcn = BAD_CHAPTER; + + *virtual_chapter_number = BAD_CHAPTER; + dm_bufio_prefetch(volume->client, + map_to_physical_page(geometry, chapter_number, 0), + geometry->index_pages_per_chapter); + + for (i = 0; i < geometry->index_pages_per_chapter; ++i) { + struct delta_index_page *page; + int result; + + result = uds_get_volume_index_page(volume, chapter_number, i, &page); + if (result != UDS_SUCCESS) + return; + + if (page->virtual_chapter_number == BAD_CHAPTER) { + uds_log_error("corrupt index page in chapter %u", chapter_number); + return; + } + + if (vcn == BAD_CHAPTER) { + vcn = page->virtual_chapter_number; + } else if (page->virtual_chapter_number != vcn) { + uds_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu", + chapter_number, + i, + (unsigned long long) vcn, + (unsigned long long) page->virtual_chapter_number); + return; + } + + if (expected_list_number != page->lowest_list_number) { + uds_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u", + chapter_number, + i, + expected_list_number, + page->lowest_list_number); + return; + } + expected_list_number = page->highest_list_number + 1; + + result = uds_validate_chapter_index_page(page, geometry); + if (result != UDS_SUCCESS) + return; + } + + if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) { + uds_log_error("chapter %u vcn %llu is out of phase (%u)", + chapter_number, + (unsigned long long) vcn, + geometry->chapters_per_volume); + return; + } + + *virtual_chapter_number = vcn; +} + +/* Find the last valid physical chapter in the volume. */ +static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr) +{ + u32 span = 1; + u32 tries = 0; + + while (limit > 0) { + u32 chapter = (span > limit) ? 0 : limit - span; + u64 vcn = 0; + + probe_chapter(volume, chapter, &vcn); + if (vcn == BAD_CHAPTER) { + limit = chapter; + if (++tries > 1) + span *= 2; + } else { + if (span == 1) + break; + span /= 2; + tries = 0; + } + } + + *limit_ptr = limit; +} + +static int +find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn, u64 *highest_vcn) +{ + struct geometry *geometry = volume->geometry; + u64 zero_vcn; + u64 lowest = BAD_CHAPTER; + u64 highest = BAD_CHAPTER; + u64 moved_chapter = BAD_CHAPTER; + u32 left_chapter = 0; + u32 right_chapter = 0; + u32 bad_chapters = 0; + + /* + * This method assumes there is at most one run of contiguous bad chapters caused by + * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the + * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the + * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately + * precedes the lowest one. + */ + + /* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */ + probe_chapter(volume, 0, &zero_vcn); + + /* + * Binary search for end of the discontinuity in the monotonically increasing virtual + * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're + * searching for the index of the smallest value less than zero_vcn. In the case we go off + * the end it means that chapter 0 has the lowest vcn. + * + * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always + * skip over the moved chapter when searching, adding it to the range at the end if + * necessary. + */ + if (geometry->remapped_physical > 0) { + u64 remapped_vcn; + + probe_chapter(volume, geometry->remapped_physical, &remapped_vcn); + if (remapped_vcn == geometry->remapped_virtual) + moved_chapter = geometry->remapped_physical; + } + + left_chapter = 0; + right_chapter = chapter_limit; + + while (left_chapter < right_chapter) { + u64 probe_vcn; + u32 chapter = (left_chapter + right_chapter) / 2; + + if (chapter == moved_chapter) + chapter--; + + probe_chapter(volume, chapter, &probe_vcn); + if (zero_vcn <= probe_vcn) { + left_chapter = chapter + 1; + if (left_chapter == moved_chapter) + left_chapter++; + } else { + right_chapter = chapter; + } + } + + /* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number.*/ + if (left_chapter >= chapter_limit) + left_chapter = 0; + + /* At this point, left_chapter is the chapter with the lowest virtual chapter number. */ + probe_chapter(volume, left_chapter, &lowest); + + /* The moved chapter might be the lowest in the range. */ + if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1)) + lowest = geometry->remapped_virtual; + + /* + * Circularly scan backwards, moving over any bad chapters until encountering a good one, + * which is the chapter with the highest vcn. + */ + while (highest == BAD_CHAPTER) { + right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit; + if (right_chapter == moved_chapter) + continue; + + probe_chapter(volume, right_chapter, &highest); + if (bad_chapters++ >= MAX_BAD_CHAPTERS) { + uds_log_error("too many bad chapters in volume: %u", bad_chapters); + return UDS_CORRUPT_DATA; + } + } + + *lowest_vcn = lowest; + *highest_vcn = highest; + return UDS_SUCCESS; +} + +/* + * Find the highest and lowest contiguous chapters present in the volume and determine their + * virtual chapter numbers. This is used by rebuild. + */ +int uds_find_volume_chapter_boundaries(struct volume *volume, + u64 *lowest_vcn, + u64 *highest_vcn, + bool *is_empty) +{ + u32 chapter_limit = volume->geometry->chapters_per_volume; + + find_real_end_of_volume(volume, chapter_limit, &chapter_limit); + if (chapter_limit == 0) { + *lowest_vcn = 0; + *highest_vcn = 0; + *is_empty = true; + return UDS_SUCCESS; + } + + *is_empty = false; + return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn); +} + +int __must_check +uds_replace_volume_storage(struct volume *volume, struct index_layout *layout, const char *name) +{ + int result; + u32 i; + + result = uds_replace_index_layout_storage(layout, name); + if (result != UDS_SUCCESS) + return result; + + /* Release all outstanding dm_bufio objects */ + for (i = 0; i < volume->page_cache.indexable_pages; i++) + volume->page_cache.index[i] = volume->page_cache.cache_slots; + for (i = 0; i < volume->page_cache.cache_slots; i++) + clear_cache_page(&volume->page_cache, &volume->page_cache.cache[i]); + if (volume->sparse_cache != NULL) + uds_invalidate_sparse_cache(volume->sparse_cache); + if (volume->client != NULL) + dm_bufio_client_destroy(UDS_FORGET(volume->client)); + + return uds_open_volume_bufio(layout, + volume->geometry->bytes_per_page, + volume->reserved_buffers, + &volume->client); +} + +static int __must_check +initialize_page_cache(struct page_cache *cache, + const struct geometry *geometry, + u32 chapters_in_cache, + unsigned int zone_count) +{ + int result; + u32 i; + + cache->indexable_pages = geometry->pages_per_volume + 1; + cache->cache_slots = chapters_in_cache * geometry->record_pages_per_chapter; + cache->zone_count = zone_count; + atomic64_set(&cache->clock, 1); + + result = ASSERT((cache->cache_slots <= VOLUME_CACHE_MAX_ENTRIES), + "requested cache size, %u, within limit %u", + cache->cache_slots, + VOLUME_CACHE_MAX_ENTRIES); + if (result != UDS_SUCCESS) + return result; + + result = UDS_ALLOCATE(VOLUME_CACHE_MAX_QUEUED_READS, + struct queued_read, + "volume read queue", + &cache->read_queue); + if (result != UDS_SUCCESS) + return result; + + result = UDS_ALLOCATE(cache->zone_count, + struct search_pending_counter, + "Volume Cache Zones", + &cache->search_pending_counters); + if (result != UDS_SUCCESS) + return result; + + result = UDS_ALLOCATE(cache->indexable_pages, u16, "page cache index", &cache->index); + if (result != UDS_SUCCESS) + return result; + + result = UDS_ALLOCATE(cache->cache_slots, + struct cached_page, + "page cache cache", + &cache->cache); + if (result != UDS_SUCCESS) + return result; + + /* Initialize index values to invalid values. */ + for (i = 0; i < cache->indexable_pages; i++) + cache->index[i] = cache->cache_slots; + + for (i = 0; i < cache->cache_slots; i++) + clear_cache_page(cache, &cache->cache[i]); + + return UDS_SUCCESS; +} + +int uds_make_volume(const struct configuration *config, + struct index_layout *layout, + struct volume **new_volume) +{ + unsigned int i; + struct volume *volume = NULL; + struct geometry *geometry; + unsigned int reserved_buffers; + int result; + + result = UDS_ALLOCATE(1, struct volume, "volume", &volume); + if (result != UDS_SUCCESS) + return result; + + volume->nonce = uds_get_volume_nonce(layout); + + result = uds_copy_geometry(config->geometry, &volume->geometry); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return uds_log_warning_strerror(result, "failed to allocate geometry: error"); + } + geometry = volume->geometry; + + /* + * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one + * for each entry in the sparse cache. + */ + reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter; + reserved_buffers += 1; + if (uds_is_sparse_geometry(geometry)) + reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter); + volume->reserved_buffers = reserved_buffers; + result = uds_open_volume_bufio(layout, + geometry->bytes_per_page, + volume->reserved_buffers, + &volume->client); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = uds_make_radix_sorter(geometry->records_per_page, &volume->radix_sorter); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = UDS_ALLOCATE(geometry->records_per_page, + const struct uds_volume_record *, + "record pointers", + &volume->record_pointers); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + if (uds_is_sparse_geometry(geometry)) { + size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page; + + result = uds_make_sparse_cache(geometry, + config->cache_chapters, + config->zone_count, + &volume->sparse_cache); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->cache_size = + page_size * geometry->index_pages_per_chapter * config->cache_chapters; + } + + result = initialize_page_cache(&volume->page_cache, + geometry, + config->cache_chapters, + config->zone_count); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page); + result = uds_make_index_page_map(geometry, &volume->index_page_map); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = uds_init_mutex(&volume->read_threads_mutex); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = uds_init_cond(&volume->read_threads_read_done_cond); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = uds_init_cond(&volume->read_threads_cond); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = UDS_ALLOCATE(config->read_threads, + struct thread *, + "reader threads", + &volume->reader_threads); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + for (i = 0; i < config->read_threads; i++) { + result = uds_create_thread(read_thread_function, + (void *) volume, + "reader", + &volume->reader_threads[i]); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->read_thread_count = i + 1; + } + + *new_volume = volume; + return UDS_SUCCESS; +} + +static void uninitialize_page_cache(struct page_cache *cache) +{ + u16 i; + + if (cache->cache != NULL) { + for (i = 0; i < cache->cache_slots; i++) + release_page_buffer(&cache->cache[i]); + } + UDS_FREE(cache->index); + UDS_FREE(cache->cache); + UDS_FREE(cache->search_pending_counters); + UDS_FREE(cache->read_queue); +} + +void uds_free_volume(struct volume *volume) +{ + if (volume == NULL) + return; + + if (volume->reader_threads != NULL) { + unsigned int i; + + /* This works even if some threads weren't started. */ + uds_lock_mutex(&volume->read_threads_mutex); + volume->read_threads_exiting = true; + uds_broadcast_cond(&volume->read_threads_cond); + uds_unlock_mutex(&volume->read_threads_mutex); + for (i = 0; i < volume->read_thread_count; i++) + uds_join_threads(volume->reader_threads[i]); + UDS_FREE(volume->reader_threads); + volume->reader_threads = NULL; + } + + /* Must destroy the client AFTER freeing the cached pages. */ + uninitialize_page_cache(&volume->page_cache); + uds_free_sparse_cache(volume->sparse_cache); + if (volume->client != NULL) + dm_bufio_client_destroy(UDS_FORGET(volume->client)); + + uds_destroy_cond(&volume->read_threads_cond); + uds_destroy_cond(&volume->read_threads_read_done_cond); + uds_destroy_mutex(&volume->read_threads_mutex); + uds_free_index_page_map(volume->index_page_map); + uds_free_radix_sorter(volume->radix_sorter); + UDS_FREE(volume->geometry); + UDS_FREE(volume->record_pointers); + UDS_FREE(volume); +} diff --git a/drivers/md/dm-vdo/volume.h b/drivers/md/dm-vdo/volume.h new file mode 100644 index 000000000000..b01541814ed7 --- /dev/null +++ b/drivers/md/dm-vdo/volume.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_VOLUME_H +#define UDS_VOLUME_H + +#include <linux/atomic.h> +#include <linux/cache.h> +#include <linux/dm-bufio.h> +#include <linux/limits.h> + +#include "chapter-index.h" +#include "config.h" +#include "geometry.h" +#include "index-layout.h" +#include "index-page-map.h" +#include "permassert.h" +#include "radix-sort.h" +#include "sparse-cache.h" +#include "uds.h" +#include "uds-threads.h" + +/* + * The volume manages deduplication records on permanent storage. The term "volume" can also refer + * to the region of permanent storage where the records (and the chapters containing them) are + * stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages + * as necessary. + */ + +enum index_lookup_mode { + /* Always do lookups in all chapters normally */ + LOOKUP_NORMAL, + /* Only do a subset of lookups needed when rebuilding an index */ + LOOKUP_FOR_REBUILD, +}; + +struct queued_read { + bool invalid; + bool reserved; + u32 physical_page; + struct uds_request *first_request; + struct uds_request *last_request; +}; + +struct __aligned(L1_CACHE_BYTES) search_pending_counter { + u64 atomic_value; +}; + +struct cached_page { + /* Whether this page is currently being read asynchronously */ + bool read_pending; + /* The physical page stored in this cache entry */ + u32 physical_page; + /* The value of the volume clock when this page was last used */ + s64 last_used; + /* The cached page buffer */ + struct dm_buffer *buffer; + /* The chapter index page, meaningless for record pages */ + struct delta_index_page index_page; +}; + +struct page_cache { + /* The number of zones */ + unsigned int zone_count; + /* The number of volume pages that can be cached */ + u32 indexable_pages; + /* The maximum number of simultaneously cached pages */ + u16 cache_slots; + /* An index for each physical page noting where it is in the cache */ + u16 *index; + /* The array of cached pages */ + struct cached_page *cache; + /* A counter for each zone tracking if a search is occurring there */ + struct search_pending_counter *search_pending_counters; + /* The read queue entries as a circular array */ + struct queued_read *read_queue; + + /* All entries above this point are constant after initialization. */ + + /* + * These values are all indexes into the array of read queue entries. New entries in the + * read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the + * lock and then claims the entry pointed to by read_queue_next_read and increments that + * value. After the read is completed, the reader thread calls release_read_queue_entry(), + * which increments read_queue_first until it points to a pending read, or is equal to + * read_queue_next_read. This means that if multiple reads are outstanding, + * read_queue_first might not advance until the last of the reads finishes. + */ + u16 read_queue_first; + u16 read_queue_next_read; + u16 read_queue_last; + + atomic64_t clock; +}; + +struct volume { + struct geometry *geometry; + struct dm_bufio_client *client; + u64 nonce; + size_t cache_size; + + /* A single page worth of records, for sorting */ + const struct uds_volume_record **record_pointers; + /* Sorter for sorting records within each page */ + struct radix_sorter *radix_sorter; + + struct sparse_cache *sparse_cache; + struct page_cache page_cache; + struct index_page_map *index_page_map; + + struct mutex read_threads_mutex; + struct cond_var read_threads_cond; + struct cond_var read_threads_read_done_cond; + struct thread **reader_threads; + unsigned int read_thread_count; + bool read_threads_exiting; + + enum index_lookup_mode lookup_mode; + unsigned int reserved_buffers; +}; + +int __must_check uds_make_volume(const struct configuration *config, + struct index_layout *layout, + struct volume **new_volume); + +void uds_free_volume(struct volume *volume); + +int __must_check +uds_replace_volume_storage(struct volume *volume, struct index_layout *layout, const char *path); + +int __must_check uds_find_volume_chapter_boundaries(struct volume *volume, + u64 *lowest_vcn, + u64 *highest_vcn, + bool *is_empty); + +int __must_check +uds_search_volume_page_cache(struct volume *volume, struct uds_request *request, bool *found); + +int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume, + const struct uds_record_name *name, + u64 virtual_chapter, + bool *found); + +int __must_check uds_search_cached_record_page(struct volume *volume, + struct uds_request *request, + u32 chapter, + u16 record_page_number, + bool *found); + +void uds_forget_chapter(struct volume *volume, u64 chapter); + +int __must_check uds_write_chapter(struct volume *volume, + struct open_chapter_index *chapter_index, + const struct uds_volume_record records[]); + +void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter); + +int __must_check +uds_read_chapter_index_from_volume(const struct volume *volume, + u64 virtual_chapter, + struct dm_buffer *volume_buffers[], + struct delta_index_page index_pages[]); + +int __must_check +uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number, u8 **data_ptr); + +int __must_check uds_get_volume_index_page(struct volume *volume, + u32 chapter, + u32 page_number, + struct delta_index_page **page_ptr); + +#endif /* UDS_VOLUME_H */

[v3,14/39] dm vdo: implement the chapter volume store

Commit Message

Patch