Message ID | 3e83b51628b6f3aeb71c5e637eca03dd63f9e95f.1607223276.git.gitgitgadget@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | diffcore-rename improvements | expand |
On Sun, Dec 06, 2020 at 02:54:36AM +0000, Elijah Newren via GitGitGadget wrote: > diff --git a/diffcore-rename.c b/diffcore-rename.c > index 816d2fbac44..fb3c2817c6b 100644 > --- a/diffcore-rename.c > +++ b/diffcore-rename.c > @@ -5,67 +5,64 @@ > #include "cache.h" > #include "diff.h" > #include "diffcore.h" > -#include "object-store.h" > #include "hashmap.h" > +#include "object-store.h" Shuffling around this object-store.h include looks like noise to me, but maybe there's a good reason for doing it. > #include "progress.h" > #include "promisor-remote.h" > +#include "strmap.h" > > /* Table of rename/copy destinations */ > > static struct diff_rename_dst { > - struct diff_filespec *two; > - struct diff_filepair *pair; > + struct diff_filepair *p; > + struct diff_filespec *filespec_to_free; > + int is_rename; /* false -> just a create; true -> rename or copy */ > } *rename_dst; > static int rename_dst_nr, rename_dst_alloc; > +/* Mapping from break source pathname to break destination index */ > +static struct strintmap *break_idx = NULL; > > -static int find_rename_dst(struct diff_filespec *two) > -{ > - int first, last; > - > - first = 0; > - last = rename_dst_nr; > - while (last > first) { > - int next = first + ((last - first) >> 1); > - struct diff_rename_dst *dst = &(rename_dst[next]); > - int cmp = strcmp(two->path, dst->two->path); > - if (!cmp) > - return next; > - if (cmp < 0) { > - last = next; > - continue; > - } > - first = next+1; > - } > - return -first - 1; > -} > - > -static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two) > +static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p) > { > - int ofs = find_rename_dst(two); > - return ofs < 0 ? NULL : &rename_dst[ofs]; > + /* Lookup by p->ONE->path */ > + int idx = break_idx ? strintmap_get(break_idx, p->one->path) : -1; I had to lookup the behavior of strintmap_get() to realize that it returns the map's "default value" to figure out why this double ternary was necessary, but I think that it is. Ideally, if break_idx is non-NULL, then we ought to be able to immediately return NULL, but it's possible that break_idx is non-NULL and simply doesn't contain p->one->path, in which case we also want to return NULL. So, I think this is as clear as it can be. > + return (idx == -1) ? NULL : &rename_dst[idx]; > } > > /* > * Returns 0 on success, -1 if we found a duplicate. > */ > -static int add_rename_dst(struct diff_filespec *two) > +static int add_rename_dst(struct diff_filepair *p) > { > - int first = find_rename_dst(two); > - > - if (first >= 0) > + /* > + * See t4058; trees might have duplicate entries. I think > + * trees with duplicate entries should be ignored and we > + * should just leave rename detection on; while the results > + * may be slightly harder to understand, that's merely a > + * result of the underlying tree making no sense. But I > + * believe everything still works fine, the results do still > + * make sense, and the extra overhead of doing this checking > + * for a few historical weird trees from long ago seems like > + * the dog wagging the tail to me. > + * > + * However: I don't feel like fighting that battle right now. > + * For now, to keep the regression test passing, we have to > + * detect it. Since the diff machinery passes these to us in > + * adjacent pairs, we just need to check to see if our name > + * matches the previous one. > + * > + * TODO: Dispense with this test, rip out the test in t4058, and > + * lobby folks for the change. > + */ > + if (rename_dst_nr > 0 && > + !strcmp(rename_dst[rename_dst_nr-1].p->two->path, p->two->path)) > return -1; > - first = -first - 1; > > - /* insert to make it at "first" */ > ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc); > + rename_dst[rename_dst_nr].p = p; > + rename_dst[rename_dst_nr].filespec_to_free = NULL; > + rename_dst[rename_dst_nr].is_rename = 0; > rename_dst_nr++; > - if (first < rename_dst_nr) > - MOVE_ARRAY(rename_dst + first + 1, rename_dst + first, > - rename_dst_nr - first - 1); > - rename_dst[first].two = alloc_filespec(two->path); > - fill_filespec(rename_dst[first].two, &two->oid, two->oid_valid, > - two->mode); > - rename_dst[first].pair = NULL; > return 0; Very nice, this is much simpler than what was here before. > @@ -78,6 +75,14 @@ static int rename_src_nr, rename_src_alloc; > > static void register_rename_src(struct diff_filepair *p) > { > + if (p->broken_pair) { > + if (!break_idx) { > + break_idx = xmalloc(sizeof(*break_idx)); > + strintmap_init(break_idx, -1); > + } > + strintmap_set(break_idx, p->one->path, rename_dst_nr); > + } > + Makes sense. > @@ -664,8 +653,9 @@ void diffcore_rename(struct diff_options *options) > */ > if (DIFF_PAIR_BROKEN(p)) { > /* broken delete */ > - struct diff_rename_dst *dst = locate_rename_dst(p->one); > - if (dst && dst->pair) > + struct diff_rename_dst *dst = locate_rename_dst(p); > + assert(dst); You're not hurting anything, but I'm not convinced that this assert() is buying you anything that the line immediately below it isn't already doing. The rest of the patch looks good to me. Thanks, Taylor
On Wed, Dec 9, 2020 at 3:01 PM Taylor Blau <me@ttaylorr.com> wrote: > > On Sun, Dec 06, 2020 at 02:54:36AM +0000, Elijah Newren via GitGitGadget wrote: > > diff --git a/diffcore-rename.c b/diffcore-rename.c > > index 816d2fbac44..fb3c2817c6b 100644 > > --- a/diffcore-rename.c > > +++ b/diffcore-rename.c > > @@ -5,67 +5,64 @@ > > #include "cache.h" > > #include "diff.h" > > #include "diffcore.h" > > -#include "object-store.h" > > #include "hashmap.h" > > +#include "object-store.h" > > Shuffling around this object-store.h include looks like noise to me, but > maybe there's a good reason for doing it. Um, well...I guess I couldn't help myself from alphabetizing the #include list (at least the portion after the initial "cache.h" or "git-compat-util.h" must come first). ;-) I probably should have done it in a separate patch. > > #include "progress.h" > > #include "promisor-remote.h" > > +#include "strmap.h" > > > > /* Table of rename/copy destinations */ > > > > static struct diff_rename_dst { > > - struct diff_filespec *two; > > - struct diff_filepair *pair; > > + struct diff_filepair *p; > > + struct diff_filespec *filespec_to_free; > > + int is_rename; /* false -> just a create; true -> rename or copy */ > > } *rename_dst; > > static int rename_dst_nr, rename_dst_alloc; > > +/* Mapping from break source pathname to break destination index */ > > +static struct strintmap *break_idx = NULL; > > > > -static int find_rename_dst(struct diff_filespec *two) > > -{ > > - int first, last; > > - > > - first = 0; > > - last = rename_dst_nr; > > - while (last > first) { > > - int next = first + ((last - first) >> 1); > > - struct diff_rename_dst *dst = &(rename_dst[next]); > > - int cmp = strcmp(two->path, dst->two->path); > > - if (!cmp) > > - return next; > > - if (cmp < 0) { > > - last = next; > > - continue; > > - } > > - first = next+1; > > - } > > - return -first - 1; > > -} > > - > > -static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two) > > +static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p) > > { > > - int ofs = find_rename_dst(two); > > - return ofs < 0 ? NULL : &rename_dst[ofs]; > > + /* Lookup by p->ONE->path */ > > + int idx = break_idx ? strintmap_get(break_idx, p->one->path) : -1; > > I had to lookup the behavior of strintmap_get() to realize that it > returns the map's "default value" to figure out why this double > ternary was necessary, but I think that it is. > > Ideally, if break_idx is non-NULL, then we ought to be able to immediately > return NULL, but it's possible that break_idx is non-NULL and simply > doesn't contain p->one->path, in which case we also want to return NULL. > > So, I think this is as clear as it can be. > > > + return (idx == -1) ? NULL : &rename_dst[idx]; > > } > > > > /* > > * Returns 0 on success, -1 if we found a duplicate. > > */ > > -static int add_rename_dst(struct diff_filespec *two) > > +static int add_rename_dst(struct diff_filepair *p) > > { > > - int first = find_rename_dst(two); > > - > > - if (first >= 0) > > + /* > > + * See t4058; trees might have duplicate entries. I think > > + * trees with duplicate entries should be ignored and we > > + * should just leave rename detection on; while the results > > + * may be slightly harder to understand, that's merely a > > + * result of the underlying tree making no sense. But I > > + * believe everything still works fine, the results do still > > + * make sense, and the extra overhead of doing this checking > > + * for a few historical weird trees from long ago seems like > > + * the dog wagging the tail to me. > > + * > > + * However: I don't feel like fighting that battle right now. > > + * For now, to keep the regression test passing, we have to > > + * detect it. Since the diff machinery passes these to us in > > + * adjacent pairs, we just need to check to see if our name > > + * matches the previous one. > > + * > > + * TODO: Dispense with this test, rip out the test in t4058, and > > + * lobby folks for the change. > > + */ > > + if (rename_dst_nr > 0 && > > + !strcmp(rename_dst[rename_dst_nr-1].p->two->path, p->two->path)) > > return -1; > > - first = -first - 1; > > > > - /* insert to make it at "first" */ > > ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc); > > + rename_dst[rename_dst_nr].p = p; > > + rename_dst[rename_dst_nr].filespec_to_free = NULL; > > + rename_dst[rename_dst_nr].is_rename = 0; > > rename_dst_nr++; > > - if (first < rename_dst_nr) > > - MOVE_ARRAY(rename_dst + first + 1, rename_dst + first, > > - rename_dst_nr - first - 1); > > - rename_dst[first].two = alloc_filespec(two->path); > > - fill_filespec(rename_dst[first].two, &two->oid, two->oid_valid, > > - two->mode); > > - rename_dst[first].pair = NULL; > > return 0; > > Very nice, this is much simpler than what was here before. > > > @@ -78,6 +75,14 @@ static int rename_src_nr, rename_src_alloc; > > > > static void register_rename_src(struct diff_filepair *p) > > { > > + if (p->broken_pair) { > > + if (!break_idx) { > > + break_idx = xmalloc(sizeof(*break_idx)); > > + strintmap_init(break_idx, -1); > > + } > > + strintmap_set(break_idx, p->one->path, rename_dst_nr); > > + } > > + > > Makes sense. > > > @@ -664,8 +653,9 @@ void diffcore_rename(struct diff_options *options) > > */ > > if (DIFF_PAIR_BROKEN(p)) { > > /* broken delete */ > > - struct diff_rename_dst *dst = locate_rename_dst(p->one); > > - if (dst && dst->pair) > > + struct diff_rename_dst *dst = locate_rename_dst(p); > > + assert(dst); > > You're not hurting anything, but I'm not convinced that this assert() is > buying you anything that the line immediately below it isn't already > doing. It's identical for runtime correctness, yes. But the primary point isn't what happens at runtime, but what happens when future code readers come along. If I only keep the "if (dst->is_rename)" line that comes after without the assert, then someone in the future will come along (maybe even myself) and think "the original author wasn't being careful here; they should change this to a check on (dst && dst->is_rename)" (because honestly, it is the kind of mistake I'm prone to make). However, if they were to do so, and some bug gets introduced so that locate_rename_dst() returns a NULL for a pair of interest, then they've masked a bug in the algorithm and made it fail in harder-to-detect-and-track-down ways. I wanted it to be clear that dst == NULL is unacceptable. I guess I could have marked it with BUG(), but between an assertion and a NULL-pointer indirection, I figured that code aborting under that condition was pretty well covered. :-) > The rest of the patch looks good to me. > > Thanks, > Taylor
Elijah Newren <newren@gmail.com> writes: > It's identical for runtime correctness, yes. But the primary point > isn't what happens at runtime, but what happens when future code > readers come along. If I only keep the "if (dst->is_rename)" line > that comes after without the assert, then someone in the future will > come along (maybe even myself) and think "the original author wasn't > being careful here; they should change this to a check on (dst && > dst->is_rename)" (because honestly, it is the kind of mistake I'm > prone to make). However, if they were to do so, and some bug gets > introduced so that locate_rename_dst() returns a NULL for a pair of > interest, then they've masked a bug in the algorithm and made it fail > in harder-to-detect-and-track-down ways. I wanted it to be clear that > dst == NULL is unacceptable. I guess I could have marked it with > BUG(), but between an assertion and a NULL-pointer indirection, I > figured that code aborting under that condition was pretty well > covered. :-) assert() often gets turned into no-op, so it is more like leaving a comment in the code. A comment or BUG("text") can describe not just the fact that reaching this point with dst==NULL is a bug in the caller but also why it is a bug (e.g. how NULL-dst would screw up the computation downstream from this point), but an assert() cannot.
diff --git a/diffcore-rename.c b/diffcore-rename.c index 816d2fbac44..fb3c2817c6b 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -5,67 +5,64 @@ #include "cache.h" #include "diff.h" #include "diffcore.h" -#include "object-store.h" #include "hashmap.h" +#include "object-store.h" #include "progress.h" #include "promisor-remote.h" +#include "strmap.h" /* Table of rename/copy destinations */ static struct diff_rename_dst { - struct diff_filespec *two; - struct diff_filepair *pair; + struct diff_filepair *p; + struct diff_filespec *filespec_to_free; + int is_rename; /* false -> just a create; true -> rename or copy */ } *rename_dst; static int rename_dst_nr, rename_dst_alloc; +/* Mapping from break source pathname to break destination index */ +static struct strintmap *break_idx = NULL; -static int find_rename_dst(struct diff_filespec *two) -{ - int first, last; - - first = 0; - last = rename_dst_nr; - while (last > first) { - int next = first + ((last - first) >> 1); - struct diff_rename_dst *dst = &(rename_dst[next]); - int cmp = strcmp(two->path, dst->two->path); - if (!cmp) - return next; - if (cmp < 0) { - last = next; - continue; - } - first = next+1; - } - return -first - 1; -} - -static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two) +static struct diff_rename_dst *locate_rename_dst(struct diff_filepair *p) { - int ofs = find_rename_dst(two); - return ofs < 0 ? NULL : &rename_dst[ofs]; + /* Lookup by p->ONE->path */ + int idx = break_idx ? strintmap_get(break_idx, p->one->path) : -1; + return (idx == -1) ? NULL : &rename_dst[idx]; } /* * Returns 0 on success, -1 if we found a duplicate. */ -static int add_rename_dst(struct diff_filespec *two) +static int add_rename_dst(struct diff_filepair *p) { - int first = find_rename_dst(two); - - if (first >= 0) + /* + * See t4058; trees might have duplicate entries. I think + * trees with duplicate entries should be ignored and we + * should just leave rename detection on; while the results + * may be slightly harder to understand, that's merely a + * result of the underlying tree making no sense. But I + * believe everything still works fine, the results do still + * make sense, and the extra overhead of doing this checking + * for a few historical weird trees from long ago seems like + * the dog wagging the tail to me. + * + * However: I don't feel like fighting that battle right now. + * For now, to keep the regression test passing, we have to + * detect it. Since the diff machinery passes these to us in + * adjacent pairs, we just need to check to see if our name + * matches the previous one. + * + * TODO: Dispense with this test, rip out the test in t4058, and + * lobby folks for the change. + */ + if (rename_dst_nr > 0 && + !strcmp(rename_dst[rename_dst_nr-1].p->two->path, p->two->path)) return -1; - first = -first - 1; - /* insert to make it at "first" */ ALLOC_GROW(rename_dst, rename_dst_nr + 1, rename_dst_alloc); + rename_dst[rename_dst_nr].p = p; + rename_dst[rename_dst_nr].filespec_to_free = NULL; + rename_dst[rename_dst_nr].is_rename = 0; rename_dst_nr++; - if (first < rename_dst_nr) - MOVE_ARRAY(rename_dst + first + 1, rename_dst + first, - rename_dst_nr - first - 1); - rename_dst[first].two = alloc_filespec(two->path); - fill_filespec(rename_dst[first].two, &two->oid, two->oid_valid, - two->mode); - rename_dst[first].pair = NULL; return 0; } @@ -78,6 +75,14 @@ static int rename_src_nr, rename_src_alloc; static void register_rename_src(struct diff_filepair *p) { + if (p->broken_pair) { + if (!break_idx) { + break_idx = xmalloc(sizeof(*break_idx)); + strintmap_init(break_idx, -1); + } + strintmap_set(break_idx, p->one->path, rename_dst_nr); + } + ALLOC_GROW(rename_src, rename_src_nr + 1, rename_src_alloc); rename_src[rename_src_nr].p = p; rename_src[rename_src_nr].score = p->score; @@ -117,14 +122,14 @@ static void prefetch(void *prefetch_options) struct oid_array to_fetch = OID_ARRAY_INIT; for (i = 0; i < rename_dst_nr; i++) { - if (rename_dst[i].pair) + if (rename_dst[i].p->renamed_pair) /* * The loop in diffcore_rename() will not need these * blobs, so skip prefetching. */ continue; /* already found exact match */ diff_add_if_missing(options->repo, &to_fetch, - rename_dst[i].two); + rename_dst[i].p->two); } for (i = 0; i < rename_src_nr; i++) { if (options->skip_unmodified && @@ -234,26 +239,24 @@ static int estimate_similarity(struct repository *r, static void record_rename_pair(int dst_index, int src_index, int score) { - struct diff_filespec *src, *dst; - struct diff_filepair *dp; + struct diff_filepair *src = rename_src[src_index].p; + struct diff_filepair *dst = rename_dst[dst_index].p; - if (rename_dst[dst_index].pair) + if (dst->renamed_pair) die("internal error: dst already matched."); - src = rename_src[src_index].p->one; - src->rename_used++; - src->count++; + src->one->rename_used++; + src->one->count++; - dst = rename_dst[dst_index].two; - dst->count++; + rename_dst[dst_index].filespec_to_free = dst->one; + rename_dst[dst_index].is_rename = 1; - dp = diff_queue(NULL, src, dst); - dp->renamed_pair = 1; - if (!strcmp(src->path, dst->path)) - dp->score = rename_src[src_index].score; + dst->one = src->one; + dst->renamed_pair = 1; + if (!strcmp(dst->one->path, dst->two->path)) + dst->score = rename_src[src_index].score; else - dp->score = score; - rename_dst[dst_index].pair = dp; + dst->score = score; } /* @@ -299,7 +302,7 @@ static int find_identical_files(struct hashmap *srcs, struct diff_options *options) { int renames = 0; - struct diff_filespec *target = rename_dst[dst_index].two; + struct diff_filespec *target = rename_dst[dst_index].p->two; struct file_similarity *p, *best = NULL; int i = 100, best_score = -1; unsigned int hash = hash_filespec(options->repo, target); @@ -460,7 +463,7 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i (mx[i].score < minimum_score)) break; /* there is no more usable pair. */ dst = &rename_dst[mx[i].dst]; - if (dst->pair) + if (dst->is_rename) continue; /* already done, either exact or fuzzy. */ if (!copies && rename_src[mx[i].src].p->one->rename_used) continue; @@ -495,7 +498,7 @@ void diffcore_rename(struct diff_options *options) else if (!options->flags.rename_empty && is_empty_blob_oid(&p->two->oid)) continue; - else if (add_rename_dst(p->two) < 0) { + else if (add_rename_dst(p) < 0) { warning("skipping rename detection, detected" " duplicate destination '%s'", p->two->path); @@ -568,10 +571,10 @@ void diffcore_rename(struct diff_options *options) mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_targets), sizeof(*mx)); for (dst_cnt = i = 0; i < rename_dst_nr; i++) { - struct diff_filespec *two = rename_dst[i].two; + struct diff_filespec *two = rename_dst[i].p->two; struct diff_score *m; - if (rename_dst[i].pair) + if (rename_dst[i].is_rename) continue; /* dealt with exact match already. */ m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST]; @@ -628,22 +631,8 @@ void diffcore_rename(struct diff_options *options) diff_q(&outq, p); } else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) { - /* - * Addition - * - * We would output this add record if it has - * not been turned into a rename/copy already. - */ - struct diff_rename_dst *dst = locate_rename_dst(p->two); - if (dst && dst->pair) { - diff_q(&outq, dst->pair); - pair_to_free = p; - } - else - /* no matching rename/copy source, so - * record this as an addition. - */ - diff_q(&outq, p); + /* Addition */ + diff_q(&outq, p); } else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) { /* @@ -664,8 +653,9 @@ void diffcore_rename(struct diff_options *options) */ if (DIFF_PAIR_BROKEN(p)) { /* broken delete */ - struct diff_rename_dst *dst = locate_rename_dst(p->one); - if (dst && dst->pair) + struct diff_rename_dst *dst = locate_rename_dst(p); + assert(dst); + if (dst->is_rename) /* counterpart is now rename/copy */ pair_to_free = p; } @@ -675,16 +665,14 @@ void diffcore_rename(struct diff_options *options) pair_to_free = p; } - if (pair_to_free) - ; - else + if (!pair_to_free) diff_q(&outq, p); } else if (!diff_unmodified_pair(p)) /* all the usual ones need to be kept */ diff_q(&outq, p); else - /* no need to keep unmodified pairs */ + /* no need to keep unmodified pairs; FIXME: remove earlier? */ pair_to_free = p; if (pair_to_free) @@ -697,11 +685,16 @@ void diffcore_rename(struct diff_options *options) diff_debug_queue("done collapsing", q); for (i = 0; i < rename_dst_nr; i++) - free_filespec(rename_dst[i].two); + if (rename_dst[i].filespec_to_free) + free_filespec(rename_dst[i].filespec_to_free); FREE_AND_NULL(rename_dst); rename_dst_nr = rename_dst_alloc = 0; FREE_AND_NULL(rename_src); rename_src_nr = rename_src_alloc = 0; + if (break_idx) { + strintmap_clear(break_idx); + FREE_AND_NULL(break_idx); + } return; }