diff mbox series

[1/2] One filter per commit

Message ID ebf0a811be047e9fd24b61fea3c4a164b3d18dc0.1539219248.git.jonathantanmy@google.com (mailing list archive)
State New, archived
Headers show
Series Per-commit filter proof of concept | expand

Commit Message

Jonathan Tan Oct. 11, 2018, 1:21 a.m. UTC
Signed-off-by: Jonathan Tan <jonathantanmy@google.com>
---
 bloom-filter.c | 31 ++++++++++++++++++-------------
 bloom-filter.h | 12 ++++--------
 commit-graph.c | 26 ++++++++++++--------------
 revision.c     |  9 +++------
 4 files changed, 37 insertions(+), 41 deletions(-)

Comments

Derrick Stolee Oct. 11, 2018, 12:49 p.m. UTC | #1
On 10/10/2018 9:21 PM, Jonathan Tan wrote:
> diff --git a/commit-graph.c b/commit-graph.c
> index f415d3b41f..90b0b3df90 100644
> --- a/commit-graph.c
> +++ b/commit-graph.c
> @@ -715,13 +715,11 @@ static int add_ref_to_list(const char *refname,
>   static void add_changes_to_bloom_filter(struct bloom_filter *bf,
>   					struct commit *parent,
>   					struct commit *commit,
> +					int index,
>   					struct diff_options *diffopt)
>   {
> -	unsigned char p_c_hash[GIT_MAX_RAWSZ];
>   	int i;
>   
> -	hashxor(parent->object.oid.hash, commit->object.oid.hash, p_c_hash);
> -
>   	diff_tree_oid(&parent->object.oid, &commit->object.oid, "", diffopt);
>   	diffcore_std(diffopt);
>   
> @@ -756,8 +754,8 @@ static void add_changes_to_bloom_filter(struct bloom_filter *bf,
>   			the_hash_algo->update_fn(&ctx, path, p - path);
>   			the_hash_algo->final_fn(name_hash, &ctx);
>   
> -			hashxor(name_hash, p_c_hash, hash);
> -			bloom_filter_add_hash(bf, hash);
> +			hashxor(name_hash, parent->object.oid.hash, hash);
> +			bloom_filter_add_hash(bf, index, hash);
>   		} while (*p);
>   
>   		diff_free_filepair(diff_queued_diff.queue[i]);
[snip]
> @@ -768,11 +766,10 @@ static void add_changes_to_bloom_filter(struct bloom_filter *bf,
>   }
>   
>   static void fill_bloom_filter(struct bloom_filter *bf,
> -				    struct progress *progress)
> +				    struct progress *progress, struct commit **commits, int commit_nr)
>   {
>   	struct rev_info revs;
>   	const char *revs_argv[] = {NULL, "--all", NULL};
> -	struct commit *commit;
>   	int i = 0;
>   
>   	/* We (re-)create the bloom filter from scratch every time for now. */
> @@ -783,18 +780,19 @@ static void fill_bloom_filter(struct bloom_filter *bf,
>   	if (prepare_revision_walk(&revs))
>   		die("revision walk setup failed while preparing bloom filter");
>   
> -	while ((commit = get_revision(&revs))) {
> +	for (i = 0; i < commit_nr; i++) {
> +		struct commit *commit = commits[i];
>   		struct commit_list *parent;
>   
>   		for (parent = commit->parents; parent; parent = parent->next)
> -			add_changes_to_bloom_filter(bf, parent->item, commit,
> +			add_changes_to_bloom_filter(bf, parent->item, commit, i,
>   						    &revs.diffopt);
>   
[snip]
>   
> -		hashxor(pi->name_hash, p_c_hash, hash);
> -		if (bloom_filter_check_hash(&bf, hash)) {
> +		hashxor(pi->name_hash, parent->object.oid.hash, hash);
> +		if (bloom_filter_check_hash(&bf, commit->graph_pos, hash)) {
>   			/*
>   			 * At least one of the interesting pathspecs differs,
>   			 * so we can return early and let the diff machinery
One main benefit of storing on Bloom filter per commit is to avoid 
recomputing hashes at every commit. Currently, this patch only improves 
locality when checking membership at the cost of taking up more space. 
Drop the dependence on the parent oid and then we can save the time 
spent hashing during history queries.

-Stolee
diff mbox series

Patch

diff --git a/bloom-filter.c b/bloom-filter.c
index 7dce0e35fa..39b453908f 100644
--- a/bloom-filter.c
+++ b/bloom-filter.c
@@ -1,14 +1,17 @@ 
 #include "cache.h"
 #include "bloom-filter.h"
 
-void bloom_filter_init(struct bloom_filter *bf, uint32_t bit_size)
+void bloom_filter_init(struct bloom_filter *bf, uint32_t commit_nr, uint32_t bit_size)
 {
 	if (bit_size % CHAR_BIT)
 		BUG("invalid size for bloom filter");
+	if (bit_size > 1024)
+		BUG("aborting: the bit size is per commit, not for the whole filter");
 
 	bf->nr_entries = 0;
+	bf->commit_nr = commit_nr;
 	bf->bit_size = bit_size;
-	bf->bits = xmalloc(bit_size / CHAR_BIT);
+	bf->bits = xcalloc(1, commit_nr * bit_size / CHAR_BIT);
 }
 
 void bloom_filter_free(struct bloom_filter *bf)
@@ -19,24 +22,24 @@  void bloom_filter_free(struct bloom_filter *bf)
 }
 
 
-void bloom_filter_set_bits(struct bloom_filter *bf, const uint32_t *offsets,
+static void bloom_filter_set_bits(struct bloom_filter *bf, uint32_t graph_pos, const uint32_t *offsets,
 			   int nr_offsets, int nr_entries)
 {
 	int i;
 	for (i = 0; i < nr_offsets; i++) {
-		uint32_t byte_offset = (offsets[i] % bf->bit_size) / CHAR_BIT;
+		uint32_t byte_offset = (offsets[i] % bf->bit_size + graph_pos * bf->bit_size) / CHAR_BIT;
 		unsigned char mask = 1 << offsets[i] % CHAR_BIT;
 		bf->bits[byte_offset] |= mask;
 	}
 	bf->nr_entries += nr_entries;
 }
 
-int bloom_filter_check_bits(struct bloom_filter *bf, const uint32_t *offsets,
+static int bloom_filter_check_bits(struct bloom_filter *bf, uint32_t graph_pos, const uint32_t *offsets,
 			    int nr)
 {
 	int i;
 	for (i = 0; i < nr; i++) {
-		uint32_t byte_offset = (offsets[i] % bf->bit_size) / CHAR_BIT;
+		uint32_t byte_offset = (offsets[i] % bf->bit_size + graph_pos * bf->bit_size) / CHAR_BIT;
 		unsigned char mask = 1 << offsets[i] % CHAR_BIT;
 		if (!(bf->bits[byte_offset] & mask))
 			return 0;
@@ -45,19 +48,19 @@  int bloom_filter_check_bits(struct bloom_filter *bf, const uint32_t *offsets,
 }
 
 
-void bloom_filter_add_hash(struct bloom_filter *bf, const unsigned char *hash)
+void bloom_filter_add_hash(struct bloom_filter *bf, uint32_t graph_pos, const unsigned char *hash)
 {
 	uint32_t offsets[GIT_MAX_RAWSZ / sizeof(uint32_t)];
 	hashcpy((unsigned char*)offsets, hash);
-	bloom_filter_set_bits(bf, offsets,
+	bloom_filter_set_bits(bf, graph_pos, offsets,
 			     the_hash_algo->rawsz / sizeof(*offsets), 1);
 }
 
-int bloom_filter_check_hash(struct bloom_filter *bf, const unsigned char *hash)
+int bloom_filter_check_hash(struct bloom_filter *bf, uint32_t graph_pos, const unsigned char *hash)
 {
 	uint32_t offsets[GIT_MAX_RAWSZ / sizeof(uint32_t)];
 	hashcpy((unsigned char*)offsets, hash);
-	return bloom_filter_check_bits(bf, offsets,
+	return bloom_filter_check_bits(bf, graph_pos, offsets,
 			the_hash_algo->rawsz / sizeof(*offsets));
 }
 
@@ -80,11 +83,12 @@  int bloom_filter_load(struct bloom_filter *bf)
 		return -1;
 
 	read_in_full(fd, &bf->nr_entries, sizeof(bf->nr_entries));
+	read_in_full(fd, &bf->commit_nr, sizeof(bf->commit_nr));
 	read_in_full(fd, &bf->bit_size, sizeof(bf->bit_size));
 	if (bf->bit_size % CHAR_BIT)
 		BUG("invalid size for bloom filter");
-	bf->bits = xmalloc(bf->bit_size / CHAR_BIT);
-	read_in_full(fd, bf->bits, bf->bit_size / CHAR_BIT);
+	bf->bits = xmalloc(bf->commit_nr * bf->bit_size / CHAR_BIT);
+	read_in_full(fd, bf->bits, bf->commit_nr * bf->bit_size / CHAR_BIT);
 
 	close(fd);
 
@@ -96,8 +100,9 @@  void bloom_filter_write(struct bloom_filter *bf)
 	int fd = xopen(git_path_bloom(), O_WRONLY | O_CREAT | O_TRUNC, 0666);
 
 	write_in_full(fd, &bf->nr_entries, sizeof(bf->nr_entries));
+	write_in_full(fd, &bf->commit_nr, sizeof(bf->commit_nr));
 	write_in_full(fd, &bf->bit_size, sizeof(bf->bit_size));
-	write_in_full(fd, bf->bits, bf->bit_size / CHAR_BIT);
+	write_in_full(fd, bf->bits, bf->commit_nr * bf->bit_size / CHAR_BIT);
 
 	close(fd);
 }
diff --git a/bloom-filter.h b/bloom-filter.h
index 94d0af1708..607649b8db 100644
--- a/bloom-filter.h
+++ b/bloom-filter.h
@@ -5,30 +5,26 @@ 
 
 struct bloom_filter {
 	uint32_t nr_entries;
+	uint32_t commit_nr;
 	uint32_t bit_size;
 	unsigned char *bits;
 };
 
 
-void bloom_filter_init(struct bloom_filter *bf, uint32_t bit_size);
+void bloom_filter_init(struct bloom_filter *bf, uint32_t commit_nr, uint32_t bit_size);
 void bloom_filter_free(struct bloom_filter *bf);
 
-void bloom_filter_set_bits(struct bloom_filter *bf, const uint32_t *offsets,
-			   int nr_offsets, int nr_enries);
-int bloom_filter_check_bits(struct bloom_filter *bf, const uint32_t *offsets,
-			    int nr);
-
 /*
  * Turns the given (SHA1) hash into 5 unsigned ints, and sets the bits at
  * those positions (modulo the bitmap's size) in the Bloom filter.
  */
-void bloom_filter_add_hash(struct bloom_filter *bf, const unsigned char *hash);
+void bloom_filter_add_hash(struct bloom_filter *bf, uint32_t graph_pos, const unsigned char *hash);
 /*
  * Turns the given (SHA1) hash into 5 unsigned ints, and checks the bits at
  * those positions (modulo the bitmap's size) in the Bloom filter.
  * Returns 1 if all those bits are set, 0 otherwise.
  */
-int bloom_filter_check_hash(struct bloom_filter *bf, const unsigned char *hash);
+int bloom_filter_check_hash(struct bloom_filter *bf, uint32_t graph_pos, const unsigned char *hash);
 
 void hashxor(const unsigned char *hash1, const unsigned char *hash2,
 	     unsigned char *out);
diff --git a/commit-graph.c b/commit-graph.c
index f415d3b41f..90b0b3df90 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -715,13 +715,11 @@  static int add_ref_to_list(const char *refname,
 static void add_changes_to_bloom_filter(struct bloom_filter *bf,
 					struct commit *parent,
 					struct commit *commit,
+					int index,
 					struct diff_options *diffopt)
 {
-	unsigned char p_c_hash[GIT_MAX_RAWSZ];
 	int i;
 
-	hashxor(parent->object.oid.hash, commit->object.oid.hash, p_c_hash);
-
 	diff_tree_oid(&parent->object.oid, &commit->object.oid, "", diffopt);
 	diffcore_std(diffopt);
 
@@ -756,8 +754,8 @@  static void add_changes_to_bloom_filter(struct bloom_filter *bf,
 			the_hash_algo->update_fn(&ctx, path, p - path);
 			the_hash_algo->final_fn(name_hash, &ctx);
 
-			hashxor(name_hash, p_c_hash, hash);
-			bloom_filter_add_hash(bf, hash);
+			hashxor(name_hash, parent->object.oid.hash, hash);
+			bloom_filter_add_hash(bf, index, hash);
 		} while (*p);
 
 		diff_free_filepair(diff_queued_diff.queue[i]);
@@ -768,11 +766,10 @@  static void add_changes_to_bloom_filter(struct bloom_filter *bf,
 }
 
 static void fill_bloom_filter(struct bloom_filter *bf,
-				    struct progress *progress)
+				    struct progress *progress, struct commit **commits, int commit_nr)
 {
 	struct rev_info revs;
 	const char *revs_argv[] = {NULL, "--all", NULL};
-	struct commit *commit;
 	int i = 0;
 
 	/* We (re-)create the bloom filter from scratch every time for now. */
@@ -783,18 +780,19 @@  static void fill_bloom_filter(struct bloom_filter *bf,
 	if (prepare_revision_walk(&revs))
 		die("revision walk setup failed while preparing bloom filter");
 
-	while ((commit = get_revision(&revs))) {
+	for (i = 0; i < commit_nr; i++) {
+		struct commit *commit = commits[i];
 		struct commit_list *parent;
 
 		for (parent = commit->parents; parent; parent = parent->next)
-			add_changes_to_bloom_filter(bf, parent->item, commit,
+			add_changes_to_bloom_filter(bf, parent->item, commit, i,
 						    &revs.diffopt);
 
-		display_progress(progress, ++i);
+		display_progress(progress, i);
 	}
 }
 
-static void write_bloom_filter(int report_progress, int commit_nr)
+static void write_bloom_filter(int report_progress, struct commit **commits, int commit_nr)
 {
 	struct bloom_filter bf;
 	struct progress *progress = NULL;
@@ -809,13 +807,13 @@  static void write_bloom_filter(int report_progress, int commit_nr)
 	if (*end)
 		die("GIT_USE_POC_BLOOM_FILTER must specify the number of bits in the bloom filter (multiple of 8, n < 2^32)");
 
-	bloom_filter_init(&bf, bitsize);
+	bloom_filter_init(&bf, commit_nr, bitsize);
 
 	if (report_progress)
 		progress = start_progress(_("Computing bloom filter"),
 					  commit_nr);
 
-	fill_bloom_filter(&bf, progress);
+	fill_bloom_filter(&bf, progress, commits, commit_nr);
 
 	bloom_filter_write(&bf);
 	bloom_filter_free(&bf);
@@ -1030,7 +1028,7 @@  void write_commit_graph(const char *obj_dir,
 	finalize_hashfile(f, NULL, CSUM_HASH_IN_STREAM | CSUM_FSYNC);
 	commit_lock_file(&lk);
 
-	write_bloom_filter(report_progress, commits.nr);
+	write_bloom_filter(report_progress, commits.list, commits.nr);
 
 	free(graph_name);
 	free(commits.list);
diff --git a/revision.c b/revision.c
index d5ba2b1598..c84a997928 100644
--- a/revision.c
+++ b/revision.c
@@ -490,7 +490,6 @@  static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 						 struct commit *parent,
 						 struct commit *commit)
 {
-	unsigned char p_c_hash[GIT_MAX_RAWSZ];
 	int i;
 
 	if (!bf.bits)
@@ -509,17 +508,15 @@  static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 	 */
 	if (!the_repository->objects->commit_graph)
 		return -1;
-	if (commit->generation == GENERATION_NUMBER_INFINITY)
+	if (commit->graph_pos == COMMIT_NOT_FROM_GRAPH)
 		return -1;
 
-	hashxor(parent->object.oid.hash, commit->object.oid.hash, p_c_hash);
-
 	for (i = 0; i < revs->pruning.pathspec.nr; i++) {
 		struct pathspec_item *pi = &revs->pruning.pathspec.items[i];
 		unsigned char hash[GIT_MAX_RAWSZ];
 
-		hashxor(pi->name_hash, p_c_hash, hash);
-		if (bloom_filter_check_hash(&bf, hash)) {
+		hashxor(pi->name_hash, parent->object.oid.hash, hash);
+		if (bloom_filter_check_hash(&bf, commit->graph_pos, hash)) {
 			/*
 			 * At least one of the interesting pathspecs differs,
 			 * so we can return early and let the diff machinery