@@ -997,6 +997,326 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb,
return;
}
+/* https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
+static int bitcount(uint32_t v) {
+ v = v - ((v >> 1) & 0x55555555u);
+ v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
+ return (((v + (v >> 4)) & 0xf0f0f0fu) * 0x1010101u) >> 24;
+}
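+/* For illustration: bitcount(0xFF) == 8 and bitcount(0x10101010) == 4. */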
+
+#define FINGERPRINT_LENGTH (8*256)
+/* This is just a bitset, with one bit per possible byte pair, indicating
+ which byte pairs are present in a line.
+ e.g. the string "good goo" has the pairs "go", "oo", "od", "d " and " g".
+ String similarity is calculated by taking the bitwise AND of two
+ fingerprints and counting the set bits.
+ TODO: for the string lengths we typically deal with, this would probably
+ be implemented more efficiently with a set data structure.
+ */
+struct fingerprint {
+ uint32_t bits[FINGERPRINT_LENGTH];
+};
+
+static void get_fingerprint(struct fingerprint *result,
+ const char *line_begin, const char *line_end) {
+ memset(result, 0, sizeof(struct fingerprint));
+ for (const char *p = line_begin; p + 1 < line_end; ++p) {
+ unsigned c = tolower((unsigned char)*p) |
+ (tolower((unsigned char)*(p + 1)) << 8);
+ result->bits[c >> 5] |= 1u << (c & 0x1f);
+ }
+}
+
+static int fingerprint_similarity(const struct fingerprint *a,
+ const struct fingerprint *b) {
+ int intersection = 0;
+ for (int i = 0; i < FINGERPRINT_LENGTH; ++i) {
+ intersection += bitcount(a->bits[i] & b->bits[i]);
+ }
+ return intersection;
+}
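+
+/* For illustration: the string "hello" has the pairs "he", "el", "ll" and
+ "lo", and "yellow" has "ye", "el", "ll", "lo" and "ow", so the similarity
+ of their fingerprints is 3 (the shared pairs "el", "ll" and "lo"). */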
+
+struct fuzzy_blame_parent_data {
+ struct blame_origin *parent;
+ long offset;
+ struct blame_entry **processed_entries;
+ struct blame_entry **target_entries;
+ const char *parent_content;
+ const char *target_content;
+ int *parent_line_starts;
+ int *target_line_starts;
+ int parent_line_count;
+ int target_line_count;
+};
+
+static void get_chunk_fingerprints(struct fingerprint *fingerprints,
+ const char *content,
+ const int *line_starts,
+ long chunk_start,
+ long chunk_length) {
+ line_starts += chunk_start;
+ for (int i = 0; i != chunk_length; ++i) {
+ const char *linestart = content + line_starts[i];
+ const char *lineend = content + line_starts[i + 1];
+ get_fingerprint(fingerprints + i, linestart, lineend);
+ }
+}
+
+/* This finds the line in b that we can match with the most confidence, and
+ uses it as a partition. It then calls itself on the lines on either side of
+ that partition. In this way we avoid lines appearing out of order, and
+ retain a sensible line ordering.
+ TODO: there is much scope for optimisation; the recursion currently
+ recomputes the same similarities at each level.
+ */
+static void fuzzy_find_matching_lines_recurse(
+ const char *content_a, const char *content_b,
+ const int *line_starts_a, const int *line_starts_b,
+ int start_a, int start_b,
+ int length_a, int length_b,
+ int *result,
+ struct fingerprint *fingerprints_a,
+ struct fingerprint *fingerprints_b) {
+
+ int most_certain_line = -1;
+ int most_certain_line_certainty = -1;
+
+ for (int i = 0; i < length_b; ++i) {
+ const struct fingerprint *fingerprint_b = fingerprints_b + i;
+
+ int closest_line_a = (i * 2 + 1) * length_a /
+ (length_b * 2);
+
+ /* Limit range of search to a reasonable number of lines.
+ TODO consider scaling this up if length_a is greater than
+ length_b. */
+ const int MAX_SEARCH_DISTANCE = 5;
+ int search_start = closest_line_a - (MAX_SEARCH_DISTANCE - 1);
+ int search_end = closest_line_a + MAX_SEARCH_DISTANCE;
+ if (search_start < 0) search_start = 0;
+ if (search_end > length_a) search_end = length_a;
+
+ /* Find both the best and 2nd best matches. The match certainty
+ is the difference between these values. */
+ int best_similarity = 0, second_best_similarity = 0;
+ int best_similarity_index = 0;
+
+ for (int j = search_start; j < search_end; ++j) {
+ int similarity = fingerprint_similarity(
+ fingerprint_b,
+ fingerprints_a + j) *
+ (1000 - abs(j - closest_line_a));
+
+ if (similarity > best_similarity) {
+ second_best_similarity = best_similarity;
+ best_similarity = similarity;
+ best_similarity_index = j;
+ }
+ else if (similarity > second_best_similarity) {
+ second_best_similarity = similarity;
+ }
+ }
+
+ if (best_similarity == 0) {
+ result[i] = -1;
+ continue;
+ }
+
+ result[i] = start_a + best_similarity_index;
+
+ int certainty = best_similarity - second_best_similarity;
+ if (certainty > most_certain_line_certainty) {
+ most_certain_line_certainty = certainty;
+ most_certain_line = i;
+ }
+ }
+
+ if (most_certain_line == -1) {
+ return;
+ }
+
+ if (most_certain_line > 0) {
+ fuzzy_find_matching_lines_recurse(content_a, content_b,
+ line_starts_a, line_starts_b,
+ start_a, start_b,
+ result[most_certain_line] + 1 - start_a,
+ most_certain_line,
+ result,
+ fingerprints_a, fingerprints_b);
+ }
+ if (most_certain_line + 1 < length_b) {
+ int second_half_start_a = result[most_certain_line];
+ int second_half_start_b = start_b + most_certain_line + 1;
+ int second_half_length_a = length_a + start_a - second_half_start_a;
+ int second_half_length_b = length_b + start_b - second_half_start_b;
+ fuzzy_find_matching_lines_recurse(content_a, content_b,
+ line_starts_a, line_starts_b,
+ second_half_start_a, second_half_start_b,
+ second_half_length_a, second_half_length_b,
+ result + most_certain_line + 1,
+ fingerprints_a + second_half_start_a - start_a,
+ fingerprints_b + most_certain_line + 1);
+ }
+}
+
+/* Find line numbers in "a" that match lines in "b".
+ Returns an array with one entry per line in "b", giving either the
+ matching line number in "a" or -1 where no match is found.
+ The returned array must be free()d after use.
+ */
+static int *fuzzy_find_matching_lines(
+ const char *content_a, const char *content_b,
+ const int *line_starts_a, const int *line_starts_b,
+ int start_a, int start_b,
+ int length_a, int length_b) {
+
+ int *result = xmalloc(sizeof(int) * length_b);
+
+ struct fingerprint *fingerprints_a =
+ xmalloc(sizeof(struct fingerprint) * length_a);
+ struct fingerprint *fingerprints_b =
+ xmalloc(sizeof(struct fingerprint) * length_b);
+
+ get_chunk_fingerprints(fingerprints_a, content_a,
+ line_starts_a,
+ start_a, length_a);
+ get_chunk_fingerprints(fingerprints_b, content_b,
+ line_starts_b,
+ start_b, length_b);
+
+ fuzzy_find_matching_lines_recurse(content_a, content_b,
+ line_starts_a, line_starts_b,
+ start_a, start_b,
+ length_a, length_b,
+ result,
+ fingerprints_a,
+ fingerprints_b);
+
+ free(fingerprints_a);
+ free(fingerprints_b);
+
+ return result;
+}
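+
+/* For illustration: matching the lines of b = "aaa", "bbbx", "bbbx", "ccc"
+ against a = "aaa", "bbb", "ccc" (with start_a == start_b == 0) is expected
+ to return {0, 1, 1, 2}: both "bbbx" lines map to the single "bbb" line in
+ a, and the partitioning keeps the matches in line order. */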
+
+static int blame_chunk_fuzzy(long parent_chunk_start,
+ long parent_chunk_length,
+ long target_chunk_start,
+ long target_chunk_length,
+ void *data)
+{
+ struct fuzzy_blame_parent_data *d = data;
+
+ if (parent_chunk_start - target_chunk_start != d->offset)
+ die("internal error in blame::blame_chunk_fuzzy");
+
+ long target_chunk_end = target_chunk_start + target_chunk_length;
+
+ struct blame_origin *parent = d->parent;
+ struct blame_entry *e = *d->target_entries;
+ struct blame_entry *parent_tail = NULL;
+ struct blame_entry *target_tail = NULL;
+
+ if (parent_chunk_length == 0) {
+ /* Don't try to blame parent for newly added lines */
+ while (e && e->s_lno < target_chunk_end) {
+ target_tail = e;
+ e = e->next;
+ }
+ if (target_tail)
+ d->target_entries = &target_tail->next;
+ goto finish;
+ }
+
+ int *matched_lines = fuzzy_find_matching_lines(
+ d->parent_content, d->target_content,
+ d->parent_line_starts, d->target_line_starts,
+ parent_chunk_start, target_chunk_start,
+ parent_chunk_length, target_chunk_length);
+
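+ /* Split each remaining entry into single-line entries: lines for which
+ matched_lines holds a parent line number are blamed on the parent, the
+ rest stay with the target. For illustration, an entry covering three
+ lines whose matched_lines values are {10, -1, 11} becomes three entries;
+ the first and third point at parent lines 10 and 11, and the middle one
+ keeps the target as its suspect. */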
+ while (e && e->s_lno < target_chunk_end) {
+ struct blame_entry *next = e->next;
+
+ for (int i = 0; i < e->num_lines; ++i) {
+ struct blame_entry *n =
+ xcalloc(1, sizeof (struct blame_entry));
+ n->lno = e->lno + i;
+ n->num_lines = 1;
+ n->score = 0;
+
+ int matched_line = matched_lines[i + e->s_lno -
+ target_chunk_start];
+
+ if (matched_line != -1) {
+ n->suspect = blame_origin_incref(parent);
+ n->s_lno = matched_line;
+ n->next = parent_tail;
+ parent_tail = n;
+ } else {
+ n->suspect = blame_origin_incref(e->suspect);
+ n->s_lno = e->s_lno + i;
+ n->next = target_tail;
+ target_tail = n;
+ }
+ }
+
+ blame_origin_decref(e->suspect);
+ free(e);
+
+ e = next;
+ }
+
+ if (parent_tail) {
+ parent_tail = llist_mergesort(parent_tail, get_next_blame,
+ set_next_blame,
+ compare_blame_suspect);
+ *d->processed_entries = parent_tail;
+ while (parent_tail->next) parent_tail = parent_tail->next;
+ d->processed_entries = &parent_tail->next;
+ }
+
+ if (target_tail) {
+ target_tail = llist_mergesort(target_tail, get_next_blame,
+ set_next_blame,
+ compare_blame_suspect);
+ *d->target_entries = target_tail;
+ while (target_tail->next) target_tail = target_tail->next;
+ d->target_entries = &target_tail->next;
+ }
+
+ *d->target_entries = e;
+
+ free(matched_lines);
+
+finish:
+ d->offset = parent_chunk_start + parent_chunk_length -
+ (target_chunk_start + target_chunk_length);
+
+ return 0;
+}
+
+static int find_line_starts(int **line_starts, const char *buf, unsigned long len);
+
+static void pass_blame_to_parent_fuzzy(struct blame_scoreboard *sb,
+ struct blame_origin *target,
+ struct blame_origin *parent)
+{
+ mmfile_t file_p, file_o;
+ struct fuzzy_blame_parent_data d;
+ struct blame_entry *newdest = NULL;
+
+ if (!target->suspects)
+ return; /* nothing remains for this target */
+
+ d.parent = parent;
+ d.offset = 0;
+ d.processed_entries = &newdest;
+ d.target_entries = &target->suspects;
+
+ fill_origin_blob(&sb->revs->diffopt, parent, &file_p, &sb->num_read_blob);
+ fill_origin_blob(&sb->revs->diffopt, target, &file_o, &sb->num_read_blob);
+ sb->num_get_patch++;
+
+ d.parent_content = file_p.ptr;
+ d.target_content = file_o.ptr;
+ d.parent_line_count = find_line_starts(&d.parent_line_starts,
+ file_p.ptr, file_p.size);
+ d.target_line_count = find_line_starts(&d.target_line_starts,
+ file_o.ptr, file_o.size);
+
+ if (diff_hunks(&file_p, &file_o, blame_chunk_fuzzy, &d, sb->xdl_opts))
+ die("unable to generate diff (%s -> %s)",
+ oid_to_hex(&parent->commit->object.oid),
+ oid_to_hex(&target->commit->object.oid));
+
+ *d.processed_entries = NULL;
+ queue_blames(sb, parent, newdest);
+
+ free(d.target_line_starts);
+ free(d.parent_line_starts);
+
+ return;
+}
+
/*
* The lines in blame_entry after splitting blames many times can become
* very small and trivial, and at some point it becomes pointless to
@@ -1433,7 +1753,7 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
struct commit *commit = origin->commit;
struct commit_list *sg;
struct blame_origin *sg_buf[MAXSG];
- struct blame_origin *porigin, **sg_origin = sg_buf;
+ struct blame_origin *porigin = NULL, **sg_origin = sg_buf;
struct blame_entry *toosmall = NULL;
struct blame_entry *blames, **blametail = &blames;
@@ -1560,6 +1880,11 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
*tail = origin->suspects;
origin->suspects = toosmall;
}
+
+ if (sb->fuzzy && porigin) {
+ pass_blame_to_parent_fuzzy(sb, origin, porigin);
+ }
+
for (i = 0; i < num_sg; i++) {
if (sg_origin[i]) {
drop_origin_blob(sg_origin[i]);
@@ -1645,14 +1970,8 @@ static const char *get_next_line(const char *start, const char *end)
return nl ? nl + 1 : end;
}
-/*
- * To allow quick access to the contents of nth line in the
- * final image, prepare an index in the scoreboard.
- */
-static int prepare_lines(struct blame_scoreboard *sb)
+static int find_line_starts(int **line_starts, const char *buf, unsigned long len)
{
- const char *buf = sb->final_buf;
- unsigned long len = sb->final_buf_size;
const char *end = buf + len;
const char *p;
int *lineno;
@@ -1661,15 +1980,26 @@ static int prepare_lines(struct blame_scoreboard *sb)
for (p = buf; p < end; p = get_next_line(p, end))
num++;
- ALLOC_ARRAY(sb->lineno, num + 1);
- lineno = sb->lineno;
+ ALLOC_ARRAY(*line_starts, num + 1);
+ lineno = *line_starts;
for (p = buf; p < end; p = get_next_line(p, end))
*lineno++ = p - buf;
*lineno = len;
- sb->num_lines = num;
+ return num;
+}
+
+/*
+ * To allow quick access to the contents of nth line in the
+ * final image, prepare an index in the scoreboard.
+ */
+static int prepare_lines(struct blame_scoreboard *sb)
+{
+ sb->num_lines = find_line_starts(&sb->lineno,
+ sb->final_buf,
+ sb->final_buf_size);
return sb->num_lines;
}
@@ -142,6 +142,7 @@ struct blame_scoreboard {
int xdl_opts;
int no_whole_file_rename;
int debug;
+ int fuzzy;
/* callbacks */
void(*on_sanity_fail)(struct blame_scoreboard *, int);
@@ -57,6 +57,7 @@ static struct date_mode blame_date_mode = { DATE_ISO8601 };
static size_t blame_date_width;
static struct string_list mailmap = STRING_LIST_INIT_NODUP;
+static int fuzzy;
#ifndef DEBUG
#define DEBUG 0
@@ -808,6 +809,7 @@ int cmd_blame(int argc, const char **argv, const char *prefix)
OPT_BIT('w', NULL, &xdl_opts, N_("Ignore whitespace differences"), XDF_IGNORE_WHITESPACE),
OPT_BIT(0, "color-lines", &output_option, N_("color redundant metadata from previous line differently"), OUTPUT_COLOR_LINE),
OPT_BIT(0, "color-by-age", &output_option, N_("color lines by age"), OUTPUT_SHOW_AGE_WITH_COLOR),
+ OPT_BOOL('F', "fuzzy", &fuzzy, N_("Try to assign blame to similar lines in the parent")),
/*
* The following two options are parsed by parse_revision_opt()
@@ -996,6 +998,7 @@ int cmd_blame(int argc, const char **argv, const char *prefix)
init_scoreboard(&sb);
sb.revs = &revs;
+ sb.fuzzy = fuzzy;
sb.contents_from = contents_from;
sb.reverse = reverse;
sb.repo = the_repository;
new file mode 100755
@@ -0,0 +1,264 @@
+#!/bin/sh
+
+test_description='git blame --fuzzy'
+. ./test-lib.sh
+
+pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/'
+
+file_count=8
+
+# Each test is composed of 4 variables:
+# titleN - the test name
+# aN - the initial content
+# bN - the final content
+# expectedN - the line numbers from aN that we expect git blame
+# on bN to identify, or "Final" if bN itself should
+# be identified as the origin of that line.
+
+title1="Expand lines"
+cat <<EOF >a1
+aaa
+bbb
+ccc
+ddd
+eee
+EOF
+cat <<EOF >b1
+aaa
+bbbx
+bbbx
+ccc
+dddx
+dddx
+eee
+EOF
+cat <<EOF >expected1
+1
+2
+2
+3
+4
+4
+5
+EOF
+
+title2="Combine 3 lines into 2"
+cat <<EOF >a2
+if ((maxgrow==0) ||
+ ( single_line_field && (field->dcols < maxgrow)) ||
+ (!single_line_field && (field->drows < maxgrow)))
+EOF
+cat <<EOF >b2
+if ((maxgrow == 0) || (single_line_field && (field->dcols < maxgrow)) ||
+ (!single_line_field && (field->drows < maxgrow))) {
+EOF
+cat <<EOF >expected2
+2
+3
+EOF
+
+title3="Add curly brackets"
+cat <<EOF >a3
+ if (rows) *rows = field->rows;
+ if (cols) *cols = field->cols;
+ if (frow) *frow = field->frow;
+ if (fcol) *fcol = field->fcol;
+EOF
+cat <<EOF >b3
+ if (rows) {
+ *rows = field->rows;
+ }
+ if (cols) {
+ *cols = field->cols;
+ }
+ if (frow) {
+ *frow = field->frow;
+ }
+ if (fcol) {
+ *fcol = field->fcol;
+ }
+EOF
+cat <<EOF >expected3
+1
+1
+Final
+2
+2
+Final
+3
+3
+Final
+4
+4
+Final
+EOF
+
+
+title4="Combine many lines and change case"
+cat <<EOF >a4
+for(row=0,pBuffer=field->buf;
+ row<height;
+ row++,pBuffer+=width )
+ {
+ if ((len = (int)( After_End_Of_Data( pBuffer, width ) - pBuffer )) > 0)
+ {
+ wmove( win, row, 0 );
+ waddnstr( win, pBuffer, len );
+EOF
+cat <<EOF >b4
+for (Row = 0, PBuffer = field->buf; Row < Height; Row++, PBuffer += Width) {
+ if ((Len = (int)(afterEndOfData(PBuffer, Width) - PBuffer)) > 0) {
+ wmove(win, Row, 0);
+ waddnstr(win, PBuffer, Len);
+EOF
+cat <<EOF >expected4
+1
+5
+7
+8
+EOF
+
+title5="Rename and combine lines"
+cat <<EOF >a5
+bool need_visual_update = ((form != (FORM *)0) &&
+ (form->status & _POSTED) &&
+ (form->current==field));
+
+if (need_visual_update)
+ Synchronize_Buffer(form);
+
+if (single_line_field)
+ {
+ growth = field->cols * amount;
+ if (field->maxgrow)
+ growth = Minimum(field->maxgrow - field->dcols,growth);
+ field->dcols += growth;
+ if (field->dcols == field->maxgrow)
+EOF
+cat <<EOF >b5
+bool NeedVisualUpdate = ((Form != (FORM *)0) && (Form->status & _POSTED) &&
+ (Form->current == field));
+
+if (NeedVisualUpdate) {
+ synchronizeBuffer(Form);
+}
+
+if (SingleLineField) {
+ Growth = field->cols * amount;
+ if (field->maxgrow) {
+ Growth = Minimum(field->maxgrow - field->dcols, Growth);
+ }
+ field->dcols += Growth;
+ if (field->dcols == field->maxgrow) {
+EOF
+cat <<EOF >expected5
+1
+3
+4
+5
+6
+Final
+7
+8
+10
+11
+12
+Final
+13
+14
+EOF
+
+# Both lines match identically so position must be used to tie-break.
+title6="Same line twice"
+cat <<EOF >a6
+abc
+abc
+EOF
+cat <<EOF >b6
+abcd
+abcd
+EOF
+cat <<EOF >expected6
+1
+2
+EOF
+
+title7="Enforce line order"
+cat <<EOF >a7
+abcdef
+ghijkl
+ab
+EOF
+cat <<EOF >b7
+ghijk
+abcd
+EOF
+cat <<EOF >expected7
+2
+3
+EOF
+
+title8="Expand lines and rename variables"
+cat <<EOF >a8
+int myFunction(int ArgumentOne, Thing *ArgTwo, Blah XuglyBug) {
+ Squiggle FabulousResult = squargle(ArgumentOne, *ArgTwo,
+ XuglyBug) + EwwwGlobalWithAReallyLongNameYepTooLong;
+ return FabulousResult * 42;
+}
+EOF
+cat <<EOF >b8
+int myFunction(int argument_one, Thing *arg_asdfgh,
+ Blah xugly_bug) {
+ Squiggle fabulous_result = squargle(argument_one,
+ *arg_asdfgh, xugly_bug)
+ + g_ewww_global_with_a_really_long_name_yep_too_long;
+ return fabulous_result * 42;
+}
+EOF
+cat <<EOF >expected8
+1
+1
+2
+3
+3
+4
+5
+EOF
+
+test_expect_success setup '
+ for i in $(test_seq 1 $file_count)
+ do
+ # Append each line in a separate commit to make it easy to
+ # check which original line the blame output relates to.
+
+ line_count=0 &&
+ while read line
+ do
+ line_count=$((line_count+1)) &&
+ echo "$line" >>"$i" &&
+ git add "$i" &&
+ test_tick &&
+ GIT_AUTHOR_NAME="$line_count" git commit -m "$line_count"
+ done <"a$i"
+ done &&
+
+ for i in $(test_seq 1 $file_count)
+ do
+ # Overwrite the files with the final content.
+ cp "b$i" "$i" &&
+ git add "$i" &&
+ test_tick
+ done &&
+
+ # Commit the final content all at once so it can all be
+ # referred to with the same commit ID.
+ GIT_AUTHOR_NAME=Final git commit -m Final
+'
+
+for i in $(test_seq 1 $file_count)
+do
+ eval "title=\$title$i"
+ test_expect_success "$title" \
+ "git blame --fuzzy -- $i | sed -e \"$pick_author\" >actual &&
+ test_cmp expected$i actual"
+done
+
+test_done
From: Michael Platings <michael@platin.gs>
---
 blame.c                | 352 +++++++++++++++++++++++++++++++++++++++++++++++--
 blame.h                |   1 +
 builtin/blame.c        |   3 +
 t/t8020-blame-fuzzy.sh | 264 +++++++++++++++++++++++++++++++++++++
 4 files changed, 609 insertions(+), 11 deletions(-)
 create mode 100755 t/t8020-blame-fuzzy.sh