diff mbox series

[04/16] reftable/block: simplify how we track restart points

Message ID 20250331-pks-reftable-polishing-v1-4-ebed5247434c@pks.im (mailing list archive)
State New
Headers show
Series reftable: overhaul the API to expose access to blocks | expand

Commit Message

Patrick Steinhardt March 31, 2025, 8:41 a.m. UTC
Restart points record the location of reftable records that do not use
prefix compression and are used to perform a binary search inside of a
block. These restart points are encoded at the end of a block, between
the record data and the footer of a table.

The block structure contains three different variables related to these
restart points:

  - The block length contains the length of the reftable block up to the
    restart points.

  - The restart count contains the number of restart points contained in
    the block.

  - The restart bytes variable tracks where the restart point data
    begins.

Tracking all three of these variables is unnecessary though as the data
can be derived from one another: the block length without restart points
is the exact same as the offset of the restart count data, which we
already track via the `restart_bytes` data.

Refactor the code so that we track the location of restart bytes not as
a pointer, but instead as an offset. This allows us to trivially get rid
of the `block_len` variable as described above. This avoids having the
confusing `block_len` variable and allows us to do less bookkeeping
overall.

Signed-off-by: Patrick Steinhardt <ps@pks.im>
---
 reftable/block.c | 25 ++++++++++++-------------
 reftable/block.h |  8 +++++---
 reftable/table.c |  2 +-
 3 files changed, 18 insertions(+), 17 deletions(-)

Comments

Justin Tobler April 2, 2025, 6:08 p.m. UTC | #1
On 25/03/31 10:41AM, Patrick Steinhardt wrote:
> Restart points record the location of reftable records that do not use
> prefix compression and are used to perform a binary search inside of a
> block. These restart points are encoded at the end of a block, between
> the record data and the footer of a table.
> 
> The block structure contains three different variables related to these
> restart points:
> 
>   - The block length contains the length of the reftable block up to the
>     restart points.
> 
>   - The restart count contains the number of restart points contained in
>     the block.
> 
>   - The restart bytes variable tracks where the restart point data
>     begins.
> 
> Tracking all three of these variables is unnecessary though as the data
> can be derived from one another: the block length without restart points
> is the exact same as the offset of the restart count data, which we
> already track via the `restart_bytes` data.
> 
> Refactor the code so that we track the location of restart bytes not as
> a pointer, but instead as an offset. This allows us to trivially get rid
> of the `block_len` variable as described above. This avoids having the
> confusing `block_len` variable and allows us to do less bookkeeping
> overall.
> 
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  reftable/block.c | 25 ++++++++++++-------------
>  reftable/block.h |  8 +++++---
>  reftable/table.c |  2 +-
>  3 files changed, 18 insertions(+), 17 deletions(-)
> 
> diff --git a/reftable/block.c b/reftable/block.c
> index 97740187259..f2567a8f0fd 100644
> --- a/reftable/block.c
> +++ b/reftable/block.c
> @@ -216,10 +216,9 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
>  	uint32_t full_block_size = table_block_size;
>  	uint8_t typ = block->data[header_off];
>  	uint32_t sz = reftable_get_be24(block->data + header_off + 1);
> -	int err = 0;
> -	uint16_t restart_count = 0;
> -	uint32_t restart_start = 0;
> -	uint8_t *restart_bytes = NULL;
> +	uint16_t restart_count;
> +	uint32_t restart_off;
> +	int err;
>  
>  	block_source_return_block(&br->block);
>  
> @@ -300,8 +299,7 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
>  	}
>  
>  	restart_count = reftable_get_be16(block->data + sz - 2);
> -	restart_start = sz - 2 - 3 * restart_count;
> -	restart_bytes = block->data + restart_start;
> +	restart_off = sz - 2 - 3 * restart_count;

Ok, since `restart_bytes` is derived from `restart_start`, instead of
having both, we just track the offset which is `restart_start` and rename
it to `restart_off`. Make sense.

>  	/* transfer ownership. */
>  	br->block = *block;
> @@ -309,11 +307,12 @@ int block_reader_init(struct block_reader *br, struct reftable_block *block,
>  	block->len = 0;
>  
>  	br->hash_size = hash_size;
> -	br->block_len = restart_start;
> +	br->restart_off = restart_off;

The `block_len` is always the size of the restart offset. I assume we
rename it here to keep naming more consistent.

The remainder of this patch looks good.

>  	br->full_block_size = full_block_size;
>  	br->header_off = header_off;
>  	br->restart_count = restart_count;
> -	br->restart_bytes = restart_bytes;
> +
> +	err = 0;
>  
>  done:
>  	return err;
> @@ -337,7 +336,7 @@ int block_reader_first_key(const struct block_reader *br, struct reftable_buf *k
>  	int off = br->header_off + 4, n;
>  	struct string_view in = {
>  		.buf = br->block.data + off,
> -		.len = br->block_len - off,
> +		.len = br->restart_off - off,
>  	};
>  	uint8_t extra = 0;
>  
> @@ -354,13 +353,13 @@ int block_reader_first_key(const struct block_reader *br, struct reftable_buf *k
>  
>  static uint32_t block_reader_restart_offset(const struct block_reader *br, size_t idx)
>  {
> -	return reftable_get_be24(br->restart_bytes + 3 * idx);
> +	return reftable_get_be24(br->block.data + br->restart_off + 3 * idx);
>  }
>  
>  void block_iter_seek_start(struct block_iter *it, const struct block_reader *br)
>  {
>  	it->block = br->block.data;
> -	it->block_len = br->block_len;
> +	it->block_len = br->restart_off;
>  	it->hash_size = br->hash_size;
>  	reftable_buf_reset(&it->last_key);
>  	it->next_off = br->header_off + 4;
> @@ -378,7 +377,7 @@ static int restart_needle_less(size_t idx, void *_args)
>  	uint32_t off = block_reader_restart_offset(args->reader, idx);
>  	struct string_view in = {
>  		.buf = args->reader->block.data + off,
> -		.len = args->reader->block_len - off,
> +		.len = args->reader->restart_off - off,
>  	};
>  	uint64_t prefix_len, suffix_len;
>  	uint8_t extra;
> @@ -505,7 +504,7 @@ int block_iter_seek_key(struct block_iter *it, const struct block_reader *br,
>  	else
>  		it->next_off = br->header_off + 4;
>  	it->block = br->block.data;
> -	it->block_len = br->block_len;
> +	it->block_len = br->restart_off;
>  	it->hash_size = br->hash_size;
>  
>  	err = reftable_record_init(&rec, block_reader_type(br));
> diff --git a/reftable/block.h b/reftable/block.h
> index 203b07d9a44..b78f322e646 100644
> --- a/reftable/block.h
> +++ b/reftable/block.h
> @@ -79,10 +79,12 @@ struct block_reader {
>  	unsigned char *uncompressed_data;
>  	size_t uncompressed_cap;
>  
> -	/* size of the data, excluding restart data. */
> -	uint32_t block_len;
> -	uint8_t *restart_bytes;
> +	/*
> +	 * Restart point data. Restart points are located after the block's
> +	 * record data.
> +	 */
>  	uint16_t restart_count;
> +	uint32_t restart_off;
>  
>  	/* size of the data in the file. For log blocks, this is the compressed
>  	 * size. */
> diff --git a/reftable/table.c b/reftable/table.c
> index d18e17b0d44..ec84545707c 100644
> --- a/reftable/table.c
> +++ b/reftable/table.c
> @@ -838,7 +838,7 @@ int reftable_table_print_blocks(const char *tablename)
>  		printf("%s:\n", sections[i].name);
>  
>  		while (1) {
> -			printf("  - length: %u\n", ti.br.block_len);
> +			printf("  - length: %u\n", ti.br.restart_off);
>  			printf("    restarts: %u\n", ti.br.restart_count);
>  
>  			err = table_iter_next_block(&ti);
> 
> -- 
> 2.49.0.604.gff1f9ca942.dirty
> 
>
diff mbox series

Patch

diff --git a/reftable/block.c b/reftable/block.c
index 97740187259..f2567a8f0fd 100644
--- a/reftable/block.c
+++ b/reftable/block.c
@@ -216,10 +216,9 @@  int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	uint32_t full_block_size = table_block_size;
 	uint8_t typ = block->data[header_off];
 	uint32_t sz = reftable_get_be24(block->data + header_off + 1);
-	int err = 0;
-	uint16_t restart_count = 0;
-	uint32_t restart_start = 0;
-	uint8_t *restart_bytes = NULL;
+	uint16_t restart_count;
+	uint32_t restart_off;
+	int err;
 
 	block_source_return_block(&br->block);
 
@@ -300,8 +299,7 @@  int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	}
 
 	restart_count = reftable_get_be16(block->data + sz - 2);
-	restart_start = sz - 2 - 3 * restart_count;
-	restart_bytes = block->data + restart_start;
+	restart_off = sz - 2 - 3 * restart_count;
 
 	/* transfer ownership. */
 	br->block = *block;
@@ -309,11 +307,12 @@  int block_reader_init(struct block_reader *br, struct reftable_block *block,
 	block->len = 0;
 
 	br->hash_size = hash_size;
-	br->block_len = restart_start;
+	br->restart_off = restart_off;
 	br->full_block_size = full_block_size;
 	br->header_off = header_off;
 	br->restart_count = restart_count;
-	br->restart_bytes = restart_bytes;
+
+	err = 0;
 
 done:
 	return err;
@@ -337,7 +336,7 @@  int block_reader_first_key(const struct block_reader *br, struct reftable_buf *k
 	int off = br->header_off + 4, n;
 	struct string_view in = {
 		.buf = br->block.data + off,
-		.len = br->block_len - off,
+		.len = br->restart_off - off,
 	};
 	uint8_t extra = 0;
 
@@ -354,13 +353,13 @@  int block_reader_first_key(const struct block_reader *br, struct reftable_buf *k
 
 static uint32_t block_reader_restart_offset(const struct block_reader *br, size_t idx)
 {
-	return reftable_get_be24(br->restart_bytes + 3 * idx);
+	return reftable_get_be24(br->block.data + br->restart_off + 3 * idx);
 }
 
 void block_iter_seek_start(struct block_iter *it, const struct block_reader *br)
 {
 	it->block = br->block.data;
-	it->block_len = br->block_len;
+	it->block_len = br->restart_off;
 	it->hash_size = br->hash_size;
 	reftable_buf_reset(&it->last_key);
 	it->next_off = br->header_off + 4;
@@ -378,7 +377,7 @@  static int restart_needle_less(size_t idx, void *_args)
 	uint32_t off = block_reader_restart_offset(args->reader, idx);
 	struct string_view in = {
 		.buf = args->reader->block.data + off,
-		.len = args->reader->block_len - off,
+		.len = args->reader->restart_off - off,
 	};
 	uint64_t prefix_len, suffix_len;
 	uint8_t extra;
@@ -505,7 +504,7 @@  int block_iter_seek_key(struct block_iter *it, const struct block_reader *br,
 	else
 		it->next_off = br->header_off + 4;
 	it->block = br->block.data;
-	it->block_len = br->block_len;
+	it->block_len = br->restart_off;
 	it->hash_size = br->hash_size;
 
 	err = reftable_record_init(&rec, block_reader_type(br));
diff --git a/reftable/block.h b/reftable/block.h
index 203b07d9a44..b78f322e646 100644
--- a/reftable/block.h
+++ b/reftable/block.h
@@ -79,10 +79,12 @@  struct block_reader {
 	unsigned char *uncompressed_data;
 	size_t uncompressed_cap;
 
-	/* size of the data, excluding restart data. */
-	uint32_t block_len;
-	uint8_t *restart_bytes;
+	/*
+	 * Restart point data. Restart points are located after the block's
+	 * record data.
+	 */
 	uint16_t restart_count;
+	uint32_t restart_off;
 
 	/* size of the data in the file. For log blocks, this is the compressed
 	 * size. */
diff --git a/reftable/table.c b/reftable/table.c
index d18e17b0d44..ec84545707c 100644
--- a/reftable/table.c
+++ b/reftable/table.c
@@ -838,7 +838,7 @@  int reftable_table_print_blocks(const char *tablename)
 		printf("%s:\n", sections[i].name);
 
 		while (1) {
-			printf("  - length: %u\n", ti.br.block_len);
+			printf("  - length: %u\n", ti.br.restart_off);
 			printf("    restarts: %u\n", ti.br.restart_count);
 
 			err = table_iter_next_block(&ti);