[15/20] merge-ort: step 3 of tree writing -- handling subdirectories as we go

Message ID	29615c366f46ced1a4b0a17d8e3ec570f60ec437.1606635803.git.gitgitgadget@gmail.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <git-owner@kernel.org> Message-Id: <29615c366f46ced1a4b0a17d8e3ec570f60ec437.1606635803.git.gitgitgadget@gmail.com> In-Reply-To: <pull.923.git.git.1606635803.gitgitgadget@gmail.com> References: <pull.923.git.git.1606635803.gitgitgadget@gmail.com> Date: Sun, 29 Nov 2020 07:43:18 +0000 Subject: [PATCH 15/20] merge-ort: step 3 of tree writing -- handling subdirectories as we go Fcc: Sent Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIME-Version: 1.0 To: git@vger.kernel.org Cc: Elijah Newren <newren@gmail.com>, Elijah Newren <newren@gmail.com> Precedence: bulk From: Elijah Newren <newren@gmail.com>
Series	fundamentals of merge-ort implementation \| expand [00/20] fundamentals of merge-ort implementation [01/20] merge-ort: setup basic internal data structures [02/20] merge-ort: add some high-level algorithm structure [03/20] merge-ort: port merge_start() from merge-recursive [04/20] merge-ort: use histogram diff [05/20] merge-ort: add an err() function similar to one from merge-recursive [06/20] merge-ort: implement a very basic collect_merge_info() [07/20] merge-ort: avoid repeating fill_tree_descriptor() on the same tree [08/20] merge-ort: compute a few more useful fields for collect_merge_info [09/20] merge-ort: record stage and auxiliary info for every path [10/20] merge-ort: avoid recursing into identical trees [11/20] merge-ort: add a preliminary simple process_entries() implementation [12/20] merge-ort: have process_entries operate in a defined order [13/20] merge-ort: step 1 of tree writing -- record basenames, modes, and oids [14/20] merge-ort: step 2 of tree writing -- function to create tree object [15/20] merge-ort: step 3 of tree writing -- handling subdirectories as we go [16/20] merge-ort: basic outline for merge_switch_to_result() [17/20] merge-ort: add implementation of checkout() [18/20] tree: enable cmp_cache_name_compare() to be used elsewhere [19/20] merge-ort: add implementation of record_conflicted_index_entries() [20/20] merge-ort: free data structures in merge_finalize()

diff --git a/merge-ort.c b/merge-ort.c index 59355de628..65dbdadc5e 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -494,7 +494,46 @@ static int string_list_df_name_compare(const char *one, const char *two) } struct directory_versions { + /* + * versions: list of (basename -> version_info) + * + * The basenames are in reverse lexicographic order of full pathnames, + * as processed in process_entries(). This puts all entries within + * a directory together, and covers the directory itself after + * everything within it, allowing us to write subtrees before needing + * to record information for the tree itself. + */ struct string_list versions; + + /* + * offsets: list of (full relative path directories -> integer offsets) + * + * Since versions contains basenames from files in multiple different + * directories, we need to know which entries in versions correspond + * to which directories. Values of e.g. + * "" 0 + * src 2 + * src/moduleA 5 + * Would mean that entries 0-1 of versions are files in the toplevel + * directory, entries 2-4 are files under src/, and the remaining + * entries starting at index 5 are files under src/moduleA/. + */ + struct string_list offsets; + + /* + * last_directory: directory the previously processed file was found in + * + * last_directory starts NULL, but records the directory in which the + * previous file was found. As soon as + * directory(current_file) != last_directory + * then we need to start updating accounting in versions & offsets. + * Note that last_directory is always the last path in "offsets" (or + * NULL if "offsets" is empty) so this exists just for quick access.
+ */ + const char *last_directory; + + /* last_directory_len: cached computation of strlen(last_directory) */ + unsigned last_directory_len; }; static void write_tree(struct object_id *result_oid, @@ -558,6 +597,181 @@ static void record_entry_for_tree(struct directory_versions *dir_metadata, basename)->util = &mi->result; } +static void write_completed_directory(struct merge_options *opt, + const char *new_directory_name, + struct directory_versions *info) +{ + const char *prev_dir; + struct merged_info *dir_info = NULL; + unsigned int offset; + + /* + * Some explanation of info->versions and info->offsets... + * + * process_entries() iterates over all relevant files AND + * directories in reverse lexicographic order, and calls this + * function. Thus, an example of the paths that process_entries() + * could operate on (along with the directories for those paths + * being shown) is: + * + * xtract.c "" + * token.txt "" + * src/moduleB/umm.c src/moduleB + * src/moduleB/stuff.h src/moduleB + * src/moduleB/baz.c src/moduleB + * src/moduleB src + * src/moduleA/foo.c src/moduleA + * src/moduleA/bar.c src/moduleA + * src/moduleA src + * src "" + * Makefile "" + * + * info->versions: + * + * always contains the unprocessed entries and their + * version_info information. For example, after the first five + * entries above, info->versions would be: + * + * xtract.c <xtract.c's version_info> + * token.txt <token.txt's version_info> + * umm.c <src/moduleB/umm.c's version_info> + * stuff.h <src/moduleB/stuff.h's version_info> + * baz.c <src/moduleB/baz.c's version_info> + * + * Once a subdirectory is completed we remove the entries in + * that subdirectory from info->versions, writing it as a tree + * (write_tree()).
Thus, as soon as we get to src/moduleB, + * info->versions would be updated to + * + * xtract.c <xtract.c's version_info> + * token.txt <token.txt's version_info> + * moduleB <src/moduleB's version_info> + * + * info->offsets: + * + * helps us track which entries in info->versions correspond to + * which directories. When we are N directories deep (e.g. 4 + * for src/modA/submod/subdir/), we have up to N+1 unprocessed + * directories (+1 because of toplevel dir). Corresponding to + * the info->versions example above, after processing five entries + * info->offsets will be: + * + * "" 0 + * src/moduleB 2 + * + * which is used to know that xtract.c & token.txt are from the + * toplevel directory, while umm.c & stuff.h & baz.c are from the + * src/moduleB directory. Again, following the example above, + * once we need to process src/moduleB, then info->offsets is + * updated to + * + * "" 0 + * src 2 + * + * which says that moduleB (and only moduleB so far) is in the + * src directory. + * + * One unique thing to note about info->offsets here is that + * "src" was not added to info->offsets until there was a path + * (a file OR directory) immediately below src/ that got + * processed. + * + * Since process_entry() just appends new entries to info->versions, + * write_completed_directory() only needs to do work if the next path + * is in a directory that is different than the last directory found + * in info->offsets. + */ + + /* + * If we are working with the same directory as the last entry, there + * is no work to do. (See comments above the directory_name member of + * struct merged_info for why we can use pointer comparison instead of + * strcmp here.)
+ */ + if (new_directory_name == info->last_directory) + return; + + /* + * If we are just starting (last_directory is NULL), or last_directory + * is a prefix of the current directory, then we can just update + * info->offsets to record the offset where we started this directory + * and update last_directory to have quick access to it. + */ + if (info->last_directory == NULL || + !strncmp(new_directory_name, info->last_directory, + info->last_directory_len)) { + uintptr_t offset = info->versions.nr; + + info->last_directory = new_directory_name; + info->last_directory_len = strlen(info->last_directory); + /* + * Record the offset into info->versions where we will + * start recording basenames of paths found within + * new_directory_name. + */ + string_list_append(&info->offsets, + info->last_directory)->util = (void*)offset; + return; + } + + /* + * The next entry that will be processed will be within + * new_directory_name. Since at this point we know that + * new_directory_name is within a different directory than + * info->last_directory, we have all entries for info->last_directory + * in info->versions and we need to create a tree object for them. + */ + dir_info = strmap_get(&opt->priv->paths, info->last_directory); + assert(dir_info); + offset = (uintptr_t)info->offsets.items[info->offsets.nr-1].util; + if (offset == info->versions.nr) { + /* + * Actually, we don't need to create a tree object in this + * case. Whenever all files within a directory disappear + * during the merge (e.g. unmodified on one side and + * deleted on the other, or files were renamed elsewhere), + * then we get here and the directory itself needs to be + * omitted from its parent tree as well. + */ + dir_info->is_null = 1; + } else { + /* + * Write out the tree to the git object directory, and also + * record the mode and oid in dir_info->result. 
+ */ + dir_info->is_null = 0; + dir_info->result.mode = S_IFDIR; + write_tree(&dir_info->result.oid, &info->versions, offset, + opt->repo->hash_algo->rawsz); + } + + /* + * We've now used several entries from info->versions and one entry + * from info->offsets, so we get rid of those values. + */ + info->offsets.nr--; + info->versions.nr = offset; + + /* + * Now we've taken care of the completed directory, but we need to + * prepare things since future entries will be in + * new_directory_name. (In particular, process_entry() will be + * appending new entries to info->versions.) So, we need to make + * sure new_directory_name is the last entry in info->offsets. + */ + prev_dir = info->offsets.nr == 0 ? NULL : + info->offsets.items[info->offsets.nr-1].string; + if (new_directory_name != prev_dir) { + uintptr_t c = info->versions.nr; + string_list_append(&info->offsets, + new_directory_name)->util = (void*)c; + } + + /* And, of course, we need to update last_directory to match. */ + info->last_directory = new_directory_name; + info->last_directory_len = strlen(info->last_directory); +} + /* Per entry merge function */ static void process_entry(struct merge_options *opt, const char *path, @@ -656,7 +870,9 @@ static void process_entries(struct merge_options *opt, struct strmap_entry *e; struct string_list plist = STRING_LIST_INIT_NODUP; struct string_list_item *entry; - struct directory_versions dir_metadata = { STRING_LIST_INIT_NODUP }; + struct directory_versions dir_metadata = { STRING_LIST_INIT_NODUP, + STRING_LIST_INIT_NODUP, + NULL, 0 }; if (strmap_empty(&opt->priv->paths)) { oidcpy(result_oid, opt->repo->hash_algo->empty_tree); @@ -676,6 +892,11 @@ static void process_entries(struct merge_options *opt, /* * Iterate over the items in reverse order, so we can handle paths * below a directory before needing to handle the directory itself. 
+ * + * This allows us to write subtrees before we need to write trees, + * and it also enables sane handling of directory/file conflicts + * (because it allows us to know whether the directory is still in + * the way when it is time to process the file at the same path). */ for (entry = &plist.items[plist.nr-1]; entry >= plist.items; --entry) { char *path = entry->string; @@ -686,6 +907,8 @@ static void process_entries(struct merge_options *opt, */ struct merged_info *mi = entry->util; + write_completed_directory(opt, mi->directory_name, + &dir_metadata); if (mi->clean) record_entry_for_tree(&dir_metadata, path, mi); else { @@ -694,17 +917,20 @@ static void process_entries(struct merge_options *opt, } } - /* - * TODO: We can't actually write a tree yet, because dir_metadata just - * contains all basenames of all files throughout the tree with their - * mode and hash. Not only is that a nonsensical tree, it will have - * lots of duplicates for paths such as "Makefile" or ".gitignore". - */ - die("Not yet implemented; need to process subtrees separately"); + if (dir_metadata.offsets.nr != 1 || + (uintptr_t)dir_metadata.offsets.items[0].util != 0) { + printf("dir_metadata.offsets.nr = %d (should be 1)\n", + dir_metadata.offsets.nr); + printf("dir_metadata.offsets.items[0].util = %u (should be 0)\n", + (unsigned)(uintptr_t)dir_metadata.offsets.items[0].util); + fflush(stdout); + BUG("dir_metadata accounting completely off; shouldn't happen"); + } write_tree(result_oid, &dir_metadata.versions, 0, opt->repo->hash_algo->rawsz); string_list_clear(&plist, 0); string_list_clear(&dir_metadata.versions, 0); + string_list_clear(&dir_metadata.offsets, 0); } void merge_switch_to_result(struct merge_options *opt,

[15/20] merge-ort: step 3 of tree writing -- handling subdirectories as we go

Commit Message

Patch