diff mbox series

[1/4] submodule-config: add submodules_of_tree() helper

Message ID 20211122223252.19922-2-chooglen@google.com (mailing list archive)
State Superseded
Headers show
Series implement branch --recurse-submodules | expand

Commit Message

Glen Choo Nov. 22, 2021, 10:32 p.m. UTC
As we introduce a submodule UX with branches, we would like to be able
to get the submodule commit ids in a superproject tree because those ids
are the source of truth e.g. "git branch --recurse-submodules topic
start-point" should create branches based off the commit ids recorded in
the superproject's 'start-point' tree.

To make this easy, introduce a submodules_of_tree() helper function that
iterates through a tree and returns the tree's gitlink entries as a
list.

Signed-off-by: Glen Choo <chooglen@google.com>
---
 submodule-config.c | 19 +++++++++++++++++++
 submodule-config.h | 13 +++++++++++++
 2 files changed, 32 insertions(+)

Comments

Jonathan Tan Nov. 23, 2021, 2:12 a.m. UTC | #1
Glen Choo <chooglen@google.com> writes:
> +struct submodule_entry_list *
> +submodules_of_tree(struct repository *r, const struct object_id *treeish_name)
> +{
> +	struct tree_desc tree;
> +	struct name_entry entry;
> +	struct submodule_entry_list *ret;
> +
> +	CALLOC_ARRAY(ret, 1);
> +	fill_tree_descriptor(r, &tree, treeish_name);
> +	while (tree_entry(&tree, &entry)) {

I think that tree_entry() doesn't recurse into subtrees, but in any case we
should test this. (I looked at patch 4 and I think that the submodules are
always in the root tree.)

This reminded me of a similar thing when fetching submodules recursively and we
needed the "before" and "after" of submodule gitlinks. You can look at the code
(collect_changed_submodules_cb() and the functions that use it in submodule.c)
but it may not be useful - in particular, that uses diff since we need to see
differences there, but we don't need that here.

I'll review the other patches tomorrow.
Ævar Arnfjörð Bjarmason Nov. 23, 2021, 10:53 a.m. UTC | #2
On Mon, Nov 22 2021, Glen Choo wrote:

> As we introduce a submodule UX with branches, we would like to be able
> to get the submodule commit ids in a superproject tree because those ids
> are the source of truth e.g. "git branch --recurse-submodules topic
> start-point" should create branches based off the commit ids recorded in
> the superproject's 'start-point' tree.
>
> To make this easy, introduce a submodules_of_tree() helper function that
> iterates through a tree and returns the tree's gitlink entries as a
> list.
>
> Signed-off-by: Glen Choo <chooglen@google.com>
> ---
>  submodule-config.c | 19 +++++++++++++++++++
>  submodule-config.h | 13 +++++++++++++
>  2 files changed, 32 insertions(+)
>
> diff --git a/submodule-config.c b/submodule-config.c
> index f95344028b..97da373301 100644
> --- a/submodule-config.c
> +++ b/submodule-config.c
> @@ -7,6 +7,7 @@
>  #include "strbuf.h"
>  #include "object-store.h"
>  #include "parse-options.h"
> +#include "tree-walk.h"
>  
>  /*
>   * submodule cache lookup structure
> @@ -726,6 +727,24 @@ const struct submodule *submodule_from_path(struct repository *r,
>  	return config_from(r->submodule_cache, treeish_name, path, lookup_path);
>  }
>  
> +struct submodule_entry_list *
> +submodules_of_tree(struct repository *r, const struct object_id *treeish_name)
> +{
> +	struct tree_desc tree;
> +	struct name_entry entry;
> +	struct submodule_entry_list *ret;
> +
> +	CALLOC_ARRAY(ret, 1);
> +	fill_tree_descriptor(r, &tree, treeish_name);
> +	while (tree_entry(&tree, &entry)) {
> +		if (!S_ISGITLINK(entry.mode))
> +			continue;
> +		ALLOC_GROW(ret->name_entries, ret->entry_nr + 1, ret->entry_alloc);
> +		ret->name_entries[ret->entry_nr++] = entry;
> +	}
> +	return ret;
> +}
> +
>  void submodule_free(struct repository *r)
>  {
>  	if (r->submodule_cache)
> diff --git a/submodule-config.h b/submodule-config.h
> index 65875b94ea..4379ec77e3 100644
> --- a/submodule-config.h
> +++ b/submodule-config.h
> @@ -6,6 +6,7 @@
>  #include "hashmap.h"
>  #include "submodule.h"
>  #include "strbuf.h"
> +#include "tree-walk.h"
>  
>  /**
>   * The submodule config cache API allows to read submodule
> @@ -67,6 +68,18 @@ const struct submodule *submodule_from_name(struct repository *r,
>  					    const struct object_id *commit_or_tree,
>  					    const char *name);
>  
> +struct submodule_entry_list {
> +	struct name_entry *name_entries;
> +	int entry_nr;
> +	int entry_alloc;
> +};
> +
> +/**
> + * Given a tree-ish, return all submodules in the tree.
> + */
> +struct submodule_entry_list *
> +submodules_of_tree(struct repository *r, const struct object_id *treeish_name);
> +
>  /**
>   * Given a tree-ish in the superproject and a path, return the submodule that
>   * is bound at the path in the named tree.

Having skimmed through this topic isn't this in 4/4 the only resulting caller:
	
	+void create_submodule_branches(struct repository *r, const char *name,
	+			       const char *start_name, int force, int reflog,
	+			       int quiet, enum branch_track track)
	+{
	+	int i = 0;
	+	char *branch_point = NULL;
	+	struct repository *subrepos;
	+	struct submodule *submodules;
	+	struct object_id super_oid;
	+	struct submodule_entry_list *submodule_entry_list;
	+	char *err_msg = NULL;
	+
	+	validate_branch_start(r, start_name, track, &super_oid, &branch_point);
	+
	+	submodule_entry_list = submodules_of_tree(r, &super_oid);
	+	CALLOC_ARRAY(subrepos, submodule_entry_list->entry_nr);
	+	CALLOC_ARRAY(submodules, submodule_entry_list->entry_nr);
	+
	+	for (i = 0; i < submodule_entry_list->entry_nr; i++) {

I think it would be better to just intorduce this function at the same
time as its (only?) user, which also makes it clear how it's used.

In this case this seems like quite a bit of over-allocation. I.e. we
return a malloc'd pointer, and iterate with tree_entry(), the caller
then needs to loop over that and do its own allocations of "struct
repository *" and "struct submodule *".

Wouldn't it be better just to have this new submodule_entry_list contain
a list of not "struct name_entry", but:

    struct new_thingy {
        struct name_entry *entry;
        struct repository *repo;
        struct submodule *submodule;
    }

Then have the caller allocate the container on the stack, pass it to
this function.

Maybe not, just musings while doing some light reading. I was surprised
at what are effectively two loops over the same data, first allocating
1/3, then the other doing the other 2/3...
Glen Choo Nov. 23, 2021, 6:35 p.m. UTC | #3
Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> Having skimmed through this topic isn't this in 4/4 the only resulting caller:
> 	
> 	+void create_submodule_branches(struct repository *r, const char *name,
> 	+			       const char *start_name, int force, int reflog,
> 	+			       int quiet, enum branch_track track)
> 	+{
> 	+	int i = 0;
> 	+	char *branch_point = NULL;
> 	+	struct repository *subrepos;
> 	+	struct submodule *submodules;
> 	+	struct object_id super_oid;
> 	+	struct submodule_entry_list *submodule_entry_list;
> 	+	char *err_msg = NULL;
> 	+
> 	+	validate_branch_start(r, start_name, track, &super_oid, &branch_point);
> 	+
> 	+	submodule_entry_list = submodules_of_tree(r, &super_oid);
> 	+	CALLOC_ARRAY(subrepos, submodule_entry_list->entry_nr);
> 	+	CALLOC_ARRAY(submodules, submodule_entry_list->entry_nr);
> 	+
> 	+	for (i = 0; i < submodule_entry_list->entry_nr; i++) {
>
> I think it would be better to just intorduce this function at the same
> time as its (only?) user, which also makes it clear how it's used.

Yes that makes sense. That is the only user (for now). 

> In this case this seems like quite a bit of over-allocation. I.e. we
> return a malloc'd pointer, and iterate with tree_entry(), the caller
> then needs to loop over that and do its own allocations of "struct
> repository *" and "struct submodule *".
>
> Wouldn't it be better just to have this new submodule_entry_list contain
> a list of not "struct name_entry", but:
>
>     struct new_thingy {
>         struct name_entry *entry;
>         struct repository *repo;
>         struct submodule *submodule;
>     }
>
> Then have the caller allocate the container on the stack, pass it to
> this function.

I thought about it as well. "struct new_thingy" is obviously the right
struct for create_submodule_branches(), but I'm not sure if it is the
right thing for other future callers e.g. "struct submodule" is only
used to give a submodule name to users in help messages.

But chances are, any caller that needs 'submodules of a tree' will need
very similar pieces of information, so it seems reasonable to do what
you said instead of over-allocating in all of the callers.

> Maybe not, just musings while doing some light reading. I was surprised
> at what are effectively two loops over the same data, first allocating
> 1/3, then the other doing the other 2/3...

The first loop validates all submodules before creating any branches
(and also happens to allocate). If we didn't have the validation step,
allocation + creating branches could just be one loop :)
Glen Choo Nov. 23, 2021, 7:48 p.m. UTC | #4
Jonathan Tan <jonathantanmy@google.com> writes:

> I think that tree_entry() doesn't recurse into subtrees, but in any case we
> should test this. (I looked at patch 4 and I think that the submodules are
> always in the root tree.)

I've tested this and indeed it doesn't work. I've attached my test case
below.

> This reminded me of a similar thing when fetching submodules recursively and we
> needed the "before" and "after" of submodule gitlinks. You can look at the code
> (collect_changed_submodules_cb() and the functions that use it in submodule.c)
> but it may not be useful - in particular, that uses diff since we need to see
> differences there, but we don't need that here.

Thanks for the hint. If that fails, I could also implement it via the
helper methods in submodule--helper.

---- >8 ----

diff --git a/t/t3207-branch-submodule.sh b/t/t3207-branch-submodule.sh
index 14ff066e91..0e086f716d 100755
--- a/t/t3207-branch-submodule.sh
+++ b/t/t3207-branch-submodule.sh
@@ -11,11 +11,15 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 test_expect_success 'setup superproject and submodule' '
 	git init super &&
 	test_commit foo &&
+	git init sub-sub-upstream &&
+	test_commit -C sub-sub-upstream foo &&
 	git init sub-upstream &&
-	test_commit -C sub-upstream foo &&
-	git -C super submodule add ../sub-upstream sub &&
+	git -C sub-upstream submodule add "$TRASH_DIRECTORY/sub-sub-upstream" sub-sub &&
+	git -C sub-upstream commit -m "add submodule" &&
+	git -C super submodule add "$TRASH_DIRECTORY/sub-upstream" sub &&
 	git -C super commit -m "add submodule" &&
-	git -C super config submodule.propagateBranches true
+	git -C super config submodule.propagateBranches true &&
+	git -C super/sub submodule update --init
 '
 
 cleanup_branches() {
@@ -26,7 +30,7 @@ cleanup_branches() {
 		git checkout main &&
 		for branch_name in "$@"; do
 			git branch -D "$branch_name"
-			git submodule foreach "(git checkout main && git branch -D $branch_name) || true"
+			git submodule foreach "cleanup_branches . $branch_name || true"
 		done
 	)
 } >/dev/null 2>/dev/null
@@ -37,8 +41,9 @@ test_expect_success '--recurse-submodules should create branches' '
 	(
 		cd super &&
 		git branch --recurse-submodules branch-a &&
-		git rev-parse --abbrev-ref branch-a &&
-		git -C sub rev-parse --abbrev-ref branch-a
+		git rev-parse branch-a &&
+		git -C sub rev-parse branch-a &&
+		git -C sub/sub-sub rev-parse branch-a
 	)
 '
Junio C Hamano Nov. 23, 2021, 10:46 p.m. UTC | #5
Glen Choo <chooglen@google.com> writes:

> +struct submodule_entry_list *
> +submodules_of_tree(struct repository *r, const struct object_id *treeish_name)
> +{
> +	struct tree_desc tree;
> +	struct name_entry entry;
> +	struct submodule_entry_list *ret;
> +
> +	CALLOC_ARRAY(ret, 1);
> +	fill_tree_descriptor(r, &tree, treeish_name);
> +	while (tree_entry(&tree, &entry)) {
> +		if (!S_ISGITLINK(entry.mode))
> +			continue;
> +		ALLOC_GROW(ret->name_entries, ret->entry_nr + 1, ret->entry_alloc);
> +		ret->name_entries[ret->entry_nr++] = entry;
> +	}
> +	return ret;
> +}

This only looks at the root level of the tree, doesn't it?  Without
any caller in the same step, it is impossible to tell if that is an
outright bug, or merely an incomplete code that will gain recursion
in a later step.

If it is the latter, I do not think that is a patch series
organization that is very friendly to reviewers.
diff mbox series

Patch

diff --git a/submodule-config.c b/submodule-config.c
index f95344028b..97da373301 100644
--- a/submodule-config.c
+++ b/submodule-config.c
@@ -7,6 +7,7 @@ 
 #include "strbuf.h"
 #include "object-store.h"
 #include "parse-options.h"
+#include "tree-walk.h"
 
 /*
  * submodule cache lookup structure
@@ -726,6 +727,24 @@  const struct submodule *submodule_from_path(struct repository *r,
 	return config_from(r->submodule_cache, treeish_name, path, lookup_path);
 }
 
+struct submodule_entry_list *
+submodules_of_tree(struct repository *r, const struct object_id *treeish_name)
+{
+	struct tree_desc tree;
+	struct name_entry entry;
+	struct submodule_entry_list *ret;
+
+	CALLOC_ARRAY(ret, 1);
+	fill_tree_descriptor(r, &tree, treeish_name);
+	while (tree_entry(&tree, &entry)) {
+		if (!S_ISGITLINK(entry.mode))
+			continue;
+		ALLOC_GROW(ret->name_entries, ret->entry_nr + 1, ret->entry_alloc);
+		ret->name_entries[ret->entry_nr++] = entry;
+	}
+	return ret;
+}
+
 void submodule_free(struct repository *r)
 {
 	if (r->submodule_cache)
diff --git a/submodule-config.h b/submodule-config.h
index 65875b94ea..4379ec77e3 100644
--- a/submodule-config.h
+++ b/submodule-config.h
@@ -6,6 +6,7 @@ 
 #include "hashmap.h"
 #include "submodule.h"
 #include "strbuf.h"
+#include "tree-walk.h"
 
 /**
  * The submodule config cache API allows to read submodule
@@ -67,6 +68,18 @@  const struct submodule *submodule_from_name(struct repository *r,
 					    const struct object_id *commit_or_tree,
 					    const char *name);
 
+struct submodule_entry_list {
+	struct name_entry *name_entries;
+	int entry_nr;
+	int entry_alloc;
+};
+
+/**
+ * Given a tree-ish, return all submodules in the tree.
+ */
+struct submodule_entry_list *
+submodules_of_tree(struct repository *r, const struct object_id *treeish_name);
+
 /**
  * Given a tree-ish in the superproject and a path, return the submodule that
  * is bound at the path in the named tree.