diff mbox series

[v5,2/6] pack-bitmap-write.c: write lookup table extension

Message ID a913e6a2cb36d8ec7900b60820d8ab3c35f60164.1658342304.git.gitgitgadget@gmail.com (mailing list archive)
State New, archived
Headers show
Series bitmap: integrate a lookup table extension to the bitmap format | expand

Commit Message

Abhradeep Chakraborty July 20, 2022, 6:38 p.m. UTC
From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>

The bitmap lookup table extension was documented by an earlier
change, but Git does not yet know how to write that extension.

Teach Git to write bitmap lookup table extension. The table contains
the list of `N` <commit_pos, offset, xor_row>` triplets. These
triplets are sorted according to their commit pos (ascending order).
The meaning of each data in the i'th triplet is given below:

  - commit_pos stores commit position (in the pack-index or midx).
    It is a 4 byte network byte order unsigned integer.

  - offset is the position (in the bitmap file) from which that
    commit's bitmap can be read.

  - xor_row is the position of the triplet in the lookup table
    whose bitmap is used to compress this bitmap, or `0xffffffff`
    if no such bitmap exists.

Mentored-by: Taylor Blau <me@ttaylorr.com>
Co-mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>
Co-authored-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
---
 pack-bitmap-write.c | 112 ++++++++++++++++++++++++++++++++++++++++----
 pack-bitmap.h       |   5 +-
 2 files changed, 107 insertions(+), 10 deletions(-)

Comments

Taylor Blau July 26, 2022, 12:52 a.m. UTC | #1
On Wed, Jul 20, 2022 at 06:38:20PM +0000, Abhradeep Chakraborty via GitGitGadget wrote:
> From: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
>
> The bitmap lookup table extension was documented by an earlier
> change, but Git does not yet know how to write that extension.
>
> Teach Git to write bitmap lookup table extension. The table contains
> the list of `N` <commit_pos, offset, xor_row>` triplets. These
> triplets are sorted according to their commit pos (ascending order).
> The meaning of each data in the i'th triplet is given below:
>
>   - commit_pos stores commit position (in the pack-index or midx).
>     It is a 4 byte network byte order unsigned integer.
>
>   - offset is the position (in the bitmap file) from which that
>     commit's bitmap can be read.
>
>   - xor_row is the position of the triplet in the lookup table
>     whose bitmap is used to compress this bitmap, or `0xffffffff`
>     if no such bitmap exists.
>
> Mentored-by: Taylor Blau <me@ttaylorr.com>
> Co-mentored-by: Kaartic Sivaraam <kaartic.sivaraam@gmail.com>
> Co-authored-by: Taylor Blau <me@ttaylorr.com>
> Signed-off-by: Abhradeep Chakraborty <chakrabortyabhradeep79@gmail.com>
> ---
>  pack-bitmap-write.c | 112 ++++++++++++++++++++++++++++++++++++++++----
>  pack-bitmap.h       |   5 +-
>  2 files changed, 107 insertions(+), 10 deletions(-)
>
> diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
> index c43375bd344..9843790cb60 100644
> --- a/pack-bitmap-write.c
> +++ b/pack-bitmap-write.c
> @@ -650,20 +650,19 @@ static const struct object_id *oid_access(size_t pos, const void *table)
>
>  static void write_selected_commits_v1(struct hashfile *f,
>  				      struct pack_idx_entry **index,
> -				      uint32_t index_nr)
> +				      uint32_t index_nr,
> +				      off_t *offsets,
> +				      uint32_t *commit_positions)
>  {
>  	int i;
>
>  	for (i = 0; i < writer.selected_nr; ++i) {
>  		struct bitmapped_commit *stored = &writer.selected[i];
>
> -		int commit_pos =
> -			oid_pos(&stored->commit->object.oid, index, index_nr, oid_access);
> +		if (offsets)
> +			offsets[i] = hashfile_total(f);
>
> -		if (commit_pos < 0)
> -			BUG("trying to write commit not in index");
> -
> -		hashwrite_be32(f, commit_pos);
> +		hashwrite_be32(f, commit_positions[i]);

I wonder if it would make this patch a little more readable to construct
and use the commit_positions array as a single preparatory step before
this commit.

What do you think?

> +static void write_lookup_table(struct hashfile *f,
> +			       struct pack_idx_entry **index,
> +			       uint32_t index_nr,
> +			       off_t *offsets,
> +			       uint32_t *commit_positions)
> +{
> +	uint32_t i;
> +	uint32_t *table, *table_inv;
> +
> +	ALLOC_ARRAY(table, writer.selected_nr);
> +	ALLOC_ARRAY(table_inv, writer.selected_nr);
> +
> +	for (i = 0; i < writer.selected_nr; i++)
> +		table[i] = i;
> +
> +	/*
> +	 * At the end of this sort table[j] = i means that the i'th
> +	 * bitmap corresponds to j'th bitmapped commit (among the selected
> +	 * commits) in lex order of OIDs.
> +	 */
> +	QSORT_S(table, writer.selected_nr, table_cmp, commit_positions);
> +
> +	/* table_inv helps us discover that relationship (i'th bitmap
> +	 * to j'th commit by j = table_inv[i])
> +	 */
> +	for (i = 0; i < writer.selected_nr; i++)
> +		table_inv[table[i]] = i;
> +
> +	trace2_region_enter("pack-bitmap-write", "writing_lookup_table", the_repository);
> +	for (i = 0; i < writer.selected_nr; i++) {
> +		struct bitmapped_commit *selected = &writer.selected[table[i]];
> +		uint32_t xor_offset = selected->xor_offset;
> +		uint32_t xor_row;
> +
> +		if (xor_offset) {
> +			/*
> +			 * xor_index stores the index (in the bitmap entries)
> +			 * of the corresponding xor bitmap. But we need to convert
> +			 * this index into lookup table's index. So, table_inv[xor_index]
> +			 * gives us the index position w.r.t. the lookup table.
> +			 *
> +			 * If "k = table[i] - xor_offset" then the xor base is the k'th
> +			 * bitmap. `table_inv[k]` gives us the position of that bitmap
> +			 * in the lookup table.
> +			 */
> +			uint32_t xor_index = table[i] - xor_offset;
> +			xor_row = table_inv[xor_index];
> +		} else {
> +			xor_row = 0xffffffff;
> +		}
> +
> +		hashwrite_be32(f, commit_positions[table[i]]);
> +		hashwrite_be64(f, (uint64_t)offsets[table[i]]);
> +		hashwrite_be32(f, xor_row);
> +	}
> +	trace2_region_leave("pack-bitmap-write", "writing_lookup_table", the_repository);
> +
> +	free(table);
> +	free(table_inv);
> +}


> @@ -715,7 +791,25 @@ void bitmap_writer_finish(struct pack_idx_entry **index,
>  	dump_bitmap(f, writer.trees);
>  	dump_bitmap(f, writer.blobs);
>  	dump_bitmap(f, writer.tags);
> -	write_selected_commits_v1(f, index, index_nr);
> +
> +	ALLOC_ARRAY(commit_positions, writer.selected_nr);
> +	for (uint32_t i = 0; i < writer.selected_nr; ++i) {

Nit; we don't typically write for-loop expressions with variable
declarations inside of them. Make sure to declare i outside of the loop,
and then this becomes:

    for (i = 0; i < writer.selected_nr; i++)

(also, we typically use the postfix ++ operator, that is "i++" instead
of "++i" unless there is a reason to prefer the latter over the former).

Thanks,
Taylor
Abhradeep Chakraborty July 26, 2022, 6:22 p.m. UTC | #2
On Tue, Jul 26, 2022 at 6:22 AM Taylor Blau <me@ttaylorr.com> wrote:
> I wonder if it would make this patch a little more readable to construct
> and use the commit_positions array as a single preparatory step before
> this commit.
>
> What do you think?

Yeah, sure! I have no problem with that.
> > +
> > +     ALLOC_ARRAY(commit_positions, writer.selected_nr);
> > +     for (uint32_t i = 0; i < writer.selected_nr; ++i) {
>
> Nit; we don't typically write for-loop expressions with variable
> declarations inside of them. Make sure to declare i outside of the loop,
> and then this becomes:
>
>     for (i = 0; i < writer.selected_nr; i++)
>
> (also, we typically use the postfix ++ operator, that is "i++" instead
> of "++i" unless there is a reason to prefer the latter over the former).

Got it. Thanks :)
diff mbox series

Patch

diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c
index c43375bd344..9843790cb60 100644
--- a/pack-bitmap-write.c
+++ b/pack-bitmap-write.c
@@ -650,20 +650,19 @@  static const struct object_id *oid_access(size_t pos, const void *table)
 
 static void write_selected_commits_v1(struct hashfile *f,
 				      struct pack_idx_entry **index,
-				      uint32_t index_nr)
+				      uint32_t index_nr,
+				      off_t *offsets,
+				      uint32_t *commit_positions)
 {
 	int i;
 
 	for (i = 0; i < writer.selected_nr; ++i) {
 		struct bitmapped_commit *stored = &writer.selected[i];
 
-		int commit_pos =
-			oid_pos(&stored->commit->object.oid, index, index_nr, oid_access);
+		if (offsets)
+			offsets[i] = hashfile_total(f);
 
-		if (commit_pos < 0)
-			BUG("trying to write commit not in index");
-
-		hashwrite_be32(f, commit_pos);
+		hashwrite_be32(f, commit_positions[i]);
 		hashwrite_u8(f, stored->xor_offset);
 		hashwrite_u8(f, stored->flags);
 
@@ -671,6 +670,81 @@  static void write_selected_commits_v1(struct hashfile *f,
 	}
 }
 
+static int table_cmp(const void *_va, const void *_vb, void *_data)
+{
+	uint32_t *commit_positions = _data;
+	uint32_t a = commit_positions[*(uint32_t *)_va];
+	uint32_t b = commit_positions[*(uint32_t *)_vb];
+
+	if (a > b)
+		return 1;
+	else if (a < b)
+		return -1;
+
+	return 0;
+}
+
+static void write_lookup_table(struct hashfile *f,
+			       struct pack_idx_entry **index,
+			       uint32_t index_nr,
+			       off_t *offsets,
+			       uint32_t *commit_positions)
+{
+	uint32_t i;
+	uint32_t *table, *table_inv;
+
+	ALLOC_ARRAY(table, writer.selected_nr);
+	ALLOC_ARRAY(table_inv, writer.selected_nr);
+
+	for (i = 0; i < writer.selected_nr; i++)
+		table[i] = i;
+
+	/*
+	 * At the end of this sort table[j] = i means that the i'th
+	 * bitmap corresponds to j'th bitmapped commit (among the selected
+	 * commits) in lex order of OIDs.
+	 */
+	QSORT_S(table, writer.selected_nr, table_cmp, commit_positions);
+
+	/* table_inv helps us discover that relationship (i'th bitmap
+	 * to j'th commit by j = table_inv[i])
+	 */
+	for (i = 0; i < writer.selected_nr; i++)
+		table_inv[table[i]] = i;
+
+	trace2_region_enter("pack-bitmap-write", "writing_lookup_table", the_repository);
+	for (i = 0; i < writer.selected_nr; i++) {
+		struct bitmapped_commit *selected = &writer.selected[table[i]];
+		uint32_t xor_offset = selected->xor_offset;
+		uint32_t xor_row;
+
+		if (xor_offset) {
+			/*
+			 * xor_index stores the index (in the bitmap entries)
+			 * of the corresponding xor bitmap. But we need to convert
+			 * this index into lookup table's index. So, table_inv[xor_index]
+			 * gives us the index position w.r.t. the lookup table.
+			 *
+			 * If "k = table[i] - xor_offset" then the xor base is the k'th
+			 * bitmap. `table_inv[k]` gives us the position of that bitmap
+			 * in the lookup table.
+			 */
+			uint32_t xor_index = table[i] - xor_offset;
+			xor_row = table_inv[xor_index];
+		} else {
+			xor_row = 0xffffffff;
+		}
+
+		hashwrite_be32(f, commit_positions[table[i]]);
+		hashwrite_be64(f, (uint64_t)offsets[table[i]]);
+		hashwrite_be32(f, xor_row);
+	}
+	trace2_region_leave("pack-bitmap-write", "writing_lookup_table", the_repository);
+
+	free(table);
+	free(table_inv);
+}
+
 static void write_hash_cache(struct hashfile *f,
 			     struct pack_idx_entry **index,
 			     uint32_t index_nr)
@@ -695,8 +769,10 @@  void bitmap_writer_finish(struct pack_idx_entry **index,
 {
 	static uint16_t default_version = 1;
 	static uint16_t flags = BITMAP_OPT_FULL_DAG;
+	off_t *offsets = NULL;
 	struct strbuf tmp_file = STRBUF_INIT;
 	struct hashfile *f;
+	uint32_t *commit_positions = NULL;
 
 	struct bitmap_disk_header header;
 
@@ -715,7 +791,25 @@  void bitmap_writer_finish(struct pack_idx_entry **index,
 	dump_bitmap(f, writer.trees);
 	dump_bitmap(f, writer.blobs);
 	dump_bitmap(f, writer.tags);
-	write_selected_commits_v1(f, index, index_nr);
+
+	ALLOC_ARRAY(commit_positions, writer.selected_nr);
+	for (uint32_t i = 0; i < writer.selected_nr; ++i) {
+		struct bitmapped_commit *stored = &writer.selected[i];
+		int commit_pos = oid_pos(&stored->commit->object.oid, index, index_nr, oid_access);
+
+		if (commit_pos < 0)
+			BUG(_("trying to write commit not in index"));
+
+		commit_positions[i] = commit_pos;
+	}
+
+	if (options & BITMAP_OPT_LOOKUP_TABLE)
+		CALLOC_ARRAY(offsets, index_nr);
+
+	write_selected_commits_v1(f, index, index_nr, offsets, commit_positions);
+
+	if (options & BITMAP_OPT_LOOKUP_TABLE)
+		write_lookup_table(f, index, index_nr, offsets, commit_positions);
 
 	if (options & BITMAP_OPT_HASH_CACHE)
 		write_hash_cache(f, index, index_nr);
@@ -730,4 +824,6 @@  void bitmap_writer_finish(struct pack_idx_entry **index,
 		die_errno("unable to rename temporary bitmap file to '%s'", filename);
 
 	strbuf_release(&tmp_file);
+	free(offsets);
+	free(commit_positions);
 }
diff --git a/pack-bitmap.h b/pack-bitmap.h
index 3d3ddd77345..67a9d0fc303 100644
--- a/pack-bitmap.h
+++ b/pack-bitmap.h
@@ -24,8 +24,9 @@  struct bitmap_disk_header {
 #define NEEDS_BITMAP (1u<<22)
 
 enum pack_bitmap_opts {
-	BITMAP_OPT_FULL_DAG = 1,
-	BITMAP_OPT_HASH_CACHE = 4,
+	BITMAP_OPT_FULL_DAG = 0x1,
+	BITMAP_OPT_HASH_CACHE = 0x4,
+	BITMAP_OPT_LOOKUP_TABLE = 0x10,
 };
 
 enum pack_bitmap_flags {