diff mbox series

rev-list: print missing object type with --missing=print-type

Message ID 20250108034012.211043-1-jltobler@gmail.com (mailing list archive)
State New
Headers show
Series rev-list: print missing object type with --missing=print-type | expand

Commit Message

Justin Tobler Jan. 8, 2025, 3:40 a.m. UTC
Handling of missing objects encounted by git-rev-list(1) can be
configured with the `--missing=<action>` option and specifying the
desired action. Of the available missing actions, none provide a way to
print additional information about the missing object such as its type.

Add a new missing action called `print-type`. Similar to `print`, this
action prints a list of missing objects but also includes the object
type if available in the form: `?<oid> [type]`.

Signed-off-by: Justin Tobler <jltobler@gmail.com>
---
Instead of adding an additional missing action type for the explicit
purpose of printing type info, I also considered a separate option
(something like `--object-types`, or maybe a `--missing-format`) that
could be used in combination with the `--missing=print` option to
achieve the same result. The main intent is for the missing object type
and I wasn't sure if type info would be useful in the general case so I
opted to choose the former approach.

Thanks,
-Justin
---
 Documentation/rev-list-options.txt |  3 ++
 builtin/rev-list.c                 | 74 +++++++++++++++++++++++++-----
 t/t6022-rev-list-missing.sh        | 13 +++++-
 3 files changed, 77 insertions(+), 13 deletions(-)


base-commit: b74ff38af58464688b211140b90ec90598d340c6

Comments

Christian Couder Jan. 8, 2025, 10:08 a.m. UTC | #1
On Wed, Jan 8, 2025 at 4:43 AM Justin Tobler <jltobler@gmail.com> wrote:
>
> Handling of missing objects encounted by git-rev-list(1) can be
> configured with the `--missing=<action>` option and specifying the
> desired action. Of the available missing actions, none provide a way to
> print additional information about the missing object such as its type.

What kind of additional information could we also print except for the type?

> Add a new missing action called `print-type`. Similar to `print`, this
> action prints a list of missing objects but also includes the object
> type if available in the form: `?<oid> [type]`.
>
> Signed-off-by: Justin Tobler <jltobler@gmail.com>
> ---
> Instead of adding an additional missing action type for the explicit
> purpose of printing type info, I also considered a separate option
> (something like `--object-types`, or maybe a `--missing-format`) that
> could be used in combination with the `--missing=print` option to
> achieve the same result. The main intent is for the missing object type
> and I wasn't sure if type info would be useful in the general case so I
> opted to choose the former approach.

If there are many other kinds of information we could also print, then
maybe a `--missing-format=<format>` option would make more sense
rather than adding `print-X`, `print-Y`, `print-X-Y`, etc. Otherwise,
yeah, I would say that `print-type` makes sense.

> @@ -103,6 +109,8 @@ static off_t get_object_disk_usage(struct object *obj)
>
>  static inline void finish_object__ma(struct object *obj)
>  {
> +       struct missing_objects_map_entry *entry, *old;
> +
>         /*
>          * Whether or not we try to dynamically fetch missing objects
>          * from the server, we currently DO NOT have the object.  We
> @@ -119,7 +127,12 @@ static inline void finish_object__ma(struct object *obj)
>                 return;
>
>         case MA_PRINT:
> -               oidset_insert(&missing_objects, &obj->oid);
> +       case MA_PRINT_TYPE:
> +               CALLOC_ARRAY(entry, 1);
> +               entry->entry.oid = obj->oid;
> +               entry->type = obj->type;
> +               old = oidmap_put(&missing_objects, entry);
> +               free(old);
>                 return;

Maybe a function like:

static void add_missing_object_entry(struct object_id *oid, unsigned int type)
{
    struct missing_objects_map_entry *entry, *old;

    CALLOC_ARRAY(entry, 1);
    entry->entry.oid = *oid;
    entry->type = type;
    old = oidmap_put(&missing_objects, entry);
    free(old);
}

and then:

    case MA_PRINT:
    case MA_PRINT_TYPE:
        add_missing_object_entry(&obj->oid, obj->type);
        return;

could help keep finish_object__ma() (which is inlined) short and also
avoid some code duplication in the code below.

>         case MA_ALLOW_PROMISOR:
> @@ -414,6 +427,12 @@ static inline int parse_missing_action_value(const char *value)
>                 return 1;
>         }
>
> +       if (!strcmp(value, "print-type")) {
> +               arg_missing_action = MA_PRINT_TYPE;
> +               fetch_if_missing = 0;
> +               return 1;
> +       }
> +
>         if (!strcmp(value, "allow-promisor")) {
>                 arg_missing_action = MA_ALLOW_PROMISOR;
>                 fetch_if_missing = 0;
> @@ -781,10 +800,26 @@ int cmd_rev_list(int argc,
>
>         if (arg_print_omitted)
>                 oidset_init(&omitted_objects, DEFAULT_OIDSET_SIZE);
> -       if (arg_missing_action == MA_PRINT) {
> -               oidset_init(&missing_objects, DEFAULT_OIDSET_SIZE);
> -               /* Add missing tips */
> -               oidset_insert_from_set(&missing_objects, &revs.missing_commits);
> +       if (arg_missing_action == MA_PRINT ||
> +           arg_missing_action == MA_PRINT_TYPE) {
> +               struct oidset_iter iter;
> +               struct object_id *oid;
> +
> +               oidmap_init(&missing_objects, DEFAULT_OIDSET_SIZE);
> +               oidset_iter_init(&revs.missing_commits, &iter);
> +
> +               /*
> +                * Revisions pointing to missing objects lack the context
> +                * required to determine object type.
> +                */
> +               while ((oid = oidset_iter_next(&iter))) {
> +                       struct missing_objects_map_entry *entry;
> +
> +                       CALLOC_ARRAY(entry, 1);
> +                       oidcpy(&entry->entry.oid, oid);
> +                       oidmap_put(&missing_objects, entry);
> +               }

Using the function suggested above this could be:

        /*
         * Revisions pointing to missing objects lack the context
         * required to determine object type.
         */
        while ((oid = oidset_iter_next(&iter)))
            add_missing_object_entry(oid, 0);

> +
>                 oidset_clear(&revs.missing_commits);
>         }
>
> @@ -801,12 +836,27 @@ int cmd_rev_list(int argc,
>                 oidset_clear(&omitted_objects);
>         }
>         if (arg_missing_action == MA_PRINT) {
> -               struct oidset_iter iter;
> -               struct object_id *oid;
> -               oidset_iter_init(&missing_objects, &iter);
> -               while ((oid = oidset_iter_next(&iter)))
> -                       printf("?%s\n", oid_to_hex(oid));
> -               oidset_clear(&missing_objects);
> +               struct missing_objects_map_entry *entry;
> +               struct oidmap_iter iter;
> +
> +               oidmap_iter_init(&missing_objects, &iter);
> +
> +               while ((entry = oidmap_iter_next(&iter)))
> +                       printf("?%s\n", oid_to_hex(&entry->entry.oid));
> +
> +               oidmap_free(&missing_objects, true);
> +       }
> +       if (arg_missing_action == MA_PRINT_TYPE) {
> +               struct missing_objects_map_entry *entry;
> +               struct oidmap_iter iter;
> +
> +               oidmap_iter_init(&missing_objects, &iter);
> +
> +               while ((entry = oidmap_iter_next(&iter)))
> +                       printf("?%s %s\n", oid_to_hex(&entry->entry.oid),
> +                              entry->type ? type_name(entry->type) : "");
> +
> +               oidmap_free(&missing_objects, true);
>         }

Maybe a function like:

static void print_missing_objects(unsigned int with_type)
{
    struct missing_objects_map_entry *entry;
    struct oidmap_iter iter;

    oidmap_iter_init(&missing_objects, &iter);

    while ((entry = oidmap_iter_next(&iter)))
        if (with_type && entry->type)
            printf("?%s %s\n", oid_to_hex(&entry->entry.oid),
                   type_name(entry->type));
        else
            printf("?%s\n", oid_to_hex(&entry->entry.oid));

    oidmap_free(&missing_objects, true);
}

could avoid some code duplication. It could also make the output a bit
cleaner as when there is no type, there would be no space after the
oid in the output.

Thanks.
Junio C Hamano Jan. 8, 2025, 3:17 p.m. UTC | #2
Justin Tobler <jltobler@gmail.com> writes:

> Handling of missing objects encounted by git-rev-list(1) can be
> configured with the `--missing=<action>` option and specifying the
> desired action. Of the available missing actions, none provide a way to
> print additional information about the missing object such as its type.
>
> Add a new missing action called `print-type`. Similar to `print`, this
> action prints a list of missing objects but also includes the object
> type if available in the form: `?<oid> [type]`.

This part needs to explain where the type information comes from and
what its significance is (see below for more details).

> diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt
> index 459e5a02f5..277a0b645e 100644
> --- a/Documentation/rev-list-options.txt
> +++ b/Documentation/rev-list-options.txt
> @@ -1024,6 +1024,9 @@ Unexpected missing objects will raise an error.
>  The form '--missing=print' is like 'allow-any', but will also print a
>  list of the missing objects.  Object IDs are prefixed with a ``?'' character.
>  +
> +The form '--missing=print-type' is like 'print', but will also print the
> +missing object type information if available in the form `?<oid> [type]`.
> ++

The users need to be told what this "type" information really means,
as its meaning is quite different from what "git cat-file -t <oid>"
would give them.  We do not have the object, so we are not learning
its type from the object itself.  How much trust should the users
put in this information, for example?

That comes back to the "where does it come from" that the future
readers of "git log" and reviewers need to be told by the proposed
log message.  Knowing the internals, I know you'd be getting it from
the "containing" objects, e.g., an object name that was found on the
"parent" object header field of another commit, which is _expected_
to be a commit, or an object name that was found in a tree entry
whose mode bits were 100644, which is _expected_ to be a blob, etc.

There are other places that you _could_ glean information about
(possibly missing) objects.  An object that is found during
"rev-list --objects" traversal (which is the topic of this patch
after all) but turned out to be missing may not just have an
expected type (because it was found in a tree object that we
successfully read) but also the full path to the object in the
top-level tree, for example.

In modern Git, there are even more places that you may be able to
use, like commit-graph that not just hints the object itself is a
commit, but what its parents are and when the commit was created.

Note that I am not suggesting to implement more code to learn "type"
information from more places than the current patch is doing.  At
least not in this iteration of the patch.  What I am getting at is
that it would help us to avoid unnecessarily limiting ourselves by
stressing on "type" too much if we at least imagine what the
possible sources of these extra pieces of information are and what
they could provide.

As I suspect that we would want to leave the door open for us to
extend this later, I would perhaps suggest an output format format
like:

    ?<object name> [<token>=<value>]...

where <token> tells what kind of extra information it is.  I expect
that the initial implementation only knows about "type" as the
<token>.  For future extensibility, we only need to say that under
the syntax:

 (1) How multiple attributes are shown?
 (2) How would a <value> with SP or LF in it is represented?

My suggestion is to have multiple <token>=<value> on the same line,
with a SP in between, and problematic bytes in <value> are quoted,
using cquote(). i.e. a <token>=<value> whose <value> part does not
begin with a double-quote ends at the first SP after it, otherwise
<value> is taken as a C-quoted string inside a pair of double-quote.

If you are adventurous, I would not mind seeing "path" implemented
as another token, since that would be fairly easily obtainable, but
it does not have to be in the initial attempt.

Thanks.
Justin Tobler Jan. 8, 2025, 10:18 p.m. UTC | #3
On 25/01/08 07:17AM, Junio C Hamano wrote:
> The users need to be told what this "type" information really means,
> as its meaning is quite different from what "git cat-file -t <oid>"
> would give them.  We do not have the object, so we are not learning
> its type from the object itself.  How much trust should the users
> put in this information, for example?
> 
> That comes back to the "where does it come from" that the future
> readers of "git log" and reviewers need to be told by the proposed
> log message.  Knowing the internals, I know you'd be getting it from
> the "containing" objects, e.g., an object name that was found on the
> "parent" object header field of another commit, which is _expected_
> to be a commit, or an object name that was found in a tree entry
> whose mode bits were 100644, which is _expected_ to be a blob, etc.

I'll update the log message in the next version to explain how
information about a missing object gets inferred. As you explained, this
is relying on the containing object to figure this out. This is why for
some missing objects it may not be possible to infer the type. For
example, if a missing object is only reffered to by a reference, there
is not a containing object that can be used.

> There are other places that you _could_ glean information about
> (possibly missing) objects.  An object that is found during
> "rev-list --objects" traversal (which is the topic of this patch
> after all) but turned out to be missing may not just have an
> expected type (because it was found in a tree object that we
> successfully read) but also the full path to the object in the
> top-level tree, for example.
> 
> In modern Git, there are even more places that you may be able to
> use, like commit-graph that not just hints the object itself is a
> commit, but what its parents are and when the commit was created.
> 
> Note that I am not suggesting to implement more code to learn "type"
> information from more places than the current patch is doing.  At
> least not in this iteration of the patch.  What I am getting at is
> that it would help us to avoid unnecessarily limiting ourselves by
> stressing on "type" too much if we at least imagine what the
> possible sources of these extra pieces of information are and what
> they could provide.
> 
> As I suspect that we would want to leave the door open for us to
> extend this later, I would perhaps suggest an output format format
> like:
> 
>     ?<object name> [<token>=<value>]...

I think this is a great idea. To select which attributes get printed
with the missing object we could add an option. Something like:

  $ git rev-list --objects --missing=print \
  --missing-attr=path --missing-attr=type

I like the idea of also adding a path attribute, but this raises a
couple of questions. The way `--missing=print` currently works is that
it prints the unique set of missing object IDs. A missing object could
possibly be referenced by multiple trees and thus have multiple valid
paths. There are a couple of different ways this situation could be
handled:

  - We could record each of the encounted paths for an object and print
    out each. Something like: `?<oid> path=foo path=bar`

  - Historically, `--missing=print` would only ever print a single
    instance of the OID, but we could print a missing object with
    multiple paths each on a separate line. Something like this:
          
      ?e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 path=foo
      ?e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 path=bar

  - We could keep it simple a just report a single path per missing
    object. It doesn't capture the whole picture, but it does provide
    some insight into the missing object.

Since this missing object type is also inferred from the containing
object, in theory different containing objects could indicate different
types for the missing object. I'm not sure though if this is a scenario
worth accounting for. Maybe it would be fine to rely on a single
containing object to provide the type and assume it is consistent across
the others.

For paths, I'm currently leaning towards having each identified path
printed out as a separate attribute on the same line. For types, I'm
thinking we can just print a single type and assume it is consistent.
I'm certainly open to suggestions though. :)

> where <token> tells what kind of extra information it is.  I expect
> that the initial implementation only knows about "type" as the
> <token>.  For future extensibility, we only need to say that under
> the syntax:
> 
>  (1) How multiple attributes are shown?
>  (2) How would a <value> with SP or LF in it is represented?
> 
> My suggestion is to have multiple <token>=<value> on the same line,
> with a SP in between, and problematic bytes in <value> are quoted,
> using cquote(). i.e. a <token>=<value> whose <value> part does not
> begin with a double-quote ends at the first SP after it, otherwise
> <value> is taken as a C-quoted string inside a pair of double-quote.

Ok, so using a path value as an example, if it contains a SP, or a
special character that would be handled by `quote_c_style()` it should
be wrapped in double-quotes to indicate that it is all part of the
single attribute. That makes sense.

> If you are adventurous, I would not mind seeing "path" implemented
> as another token, since that would be fairly easily obtainable, but
> it does not have to be in the initial attempt.

Handling paths appears pretty straight-forward to add and seems like a
good idea. In my next version I'll also add support for a path
attribute. Thanks for the feedback.

-Justin
Justin Tobler Jan. 8, 2025, 10:28 p.m. UTC | #4
On 25/01/08 11:08AM, Christian Couder wrote:
> On Wed, Jan 8, 2025 at 4:43 AM Justin Tobler <jltobler@gmail.com> wrote:
> >
> > Handling of missing objects encounted by git-rev-list(1) can be
> > configured with the `--missing=<action>` option and specifying the
> > desired action. Of the available missing actions, none provide a way to
> > print additional information about the missing object such as its type.
> 
> What kind of additional information could we also print except for the type?

In [1], Junio suggested path information as another option.

> 
> > Add a new missing action called `print-type`. Similar to `print`, this
> > action prints a list of missing objects but also includes the object
> > type if available in the form: `?<oid> [type]`.
> >
> > Signed-off-by: Justin Tobler <jltobler@gmail.com>
> > ---
> > Instead of adding an additional missing action type for the explicit
> > purpose of printing type info, I also considered a separate option
> > (something like `--object-types`, or maybe a `--missing-format`) that
> > could be used in combination with the `--missing=print` option to
> > achieve the same result. The main intent is for the missing object type
> > and I wasn't sure if type info would be useful in the general case so I
> > opted to choose the former approach.
> 
> If there are many other kinds of information we could also print, then
> maybe a `--missing-format=<format>` option would make more sense
> rather than adding `print-X`, `print-Y`, `print-X-Y`, etc. Otherwise,
> yeah, I would say that `print-type` makes sense.

Since there is some other information that we may also want to print, I
think it makes sense to follow a more generic approach now. I'm
currently thinking could continue to use `--missing=print`, but add a
`--missing-attr` option to specify additional information we want to
print. Something like:

  $ git rev-list --objects --missing=print \
  --missing-attr=type --missing-attr=type

> > @@ -103,6 +109,8 @@ static off_t get_object_disk_usage(struct object *obj)
> >
> >  static inline void finish_object__ma(struct object *obj)
> >  {
> > +       struct missing_objects_map_entry *entry, *old;
> > +
> >         /*
> >          * Whether or not we try to dynamically fetch missing objects
> >          * from the server, we currently DO NOT have the object.  We
> > @@ -119,7 +127,12 @@ static inline void finish_object__ma(struct object *obj)
> >                 return;
> >
> >         case MA_PRINT:
> > -               oidset_insert(&missing_objects, &obj->oid);
> > +       case MA_PRINT_TYPE:
> > +               CALLOC_ARRAY(entry, 1);
> > +               entry->entry.oid = obj->oid;
> > +               entry->type = obj->type;
> > +               old = oidmap_put(&missing_objects, entry);
> > +               free(old);
> >                 return;
> 
> Maybe a function like:
> 
> static void add_missing_object_entry(struct object_id *oid, unsigned int type)
> {
>     struct missing_objects_map_entry *entry, *old;
> 
>     CALLOC_ARRAY(entry, 1);
>     entry->entry.oid = *oid;
>     entry->type = type;
>     old = oidmap_put(&missing_objects, entry);
>     free(old);
> }
> 
> and then:
> 
>     case MA_PRINT:
>     case MA_PRINT_TYPE:
>         add_missing_object_entry(&obj->oid, obj->type);
>         return;
> 
> could help keep finish_object__ma() (which is inlined) short and also
> avoid some code duplication in the code below.

Good idea, I'll factor this out in the next version.

> 
> >         case MA_ALLOW_PROMISOR:
> > @@ -414,6 +427,12 @@ static inline int parse_missing_action_value(const char *value)
> >                 return 1;
> >         }
> >
> > +       if (!strcmp(value, "print-type")) {
> > +               arg_missing_action = MA_PRINT_TYPE;
> > +               fetch_if_missing = 0;
> > +               return 1;
> > +       }
> > +
> >         if (!strcmp(value, "allow-promisor")) {
> >                 arg_missing_action = MA_ALLOW_PROMISOR;
> >                 fetch_if_missing = 0;
> > @@ -781,10 +800,26 @@ int cmd_rev_list(int argc,
> >
> >         if (arg_print_omitted)
> >                 oidset_init(&omitted_objects, DEFAULT_OIDSET_SIZE);
> > -       if (arg_missing_action == MA_PRINT) {
> > -               oidset_init(&missing_objects, DEFAULT_OIDSET_SIZE);
> > -               /* Add missing tips */
> > -               oidset_insert_from_set(&missing_objects, &revs.missing_commits);
> > +       if (arg_missing_action == MA_PRINT ||
> > +           arg_missing_action == MA_PRINT_TYPE) {
> > +               struct oidset_iter iter;
> > +               struct object_id *oid;
> > +
> > +               oidmap_init(&missing_objects, DEFAULT_OIDSET_SIZE);
> > +               oidset_iter_init(&revs.missing_commits, &iter);
> > +
> > +               /*
> > +                * Revisions pointing to missing objects lack the context
> > +                * required to determine object type.
> > +                */
> > +               while ((oid = oidset_iter_next(&iter))) {
> > +                       struct missing_objects_map_entry *entry;
> > +
> > +                       CALLOC_ARRAY(entry, 1);
> > +                       oidcpy(&entry->entry.oid, oid);
> > +                       oidmap_put(&missing_objects, entry);
> > +               }
> 
> Using the function suggested above this could be:
> 
>         /*
>          * Revisions pointing to missing objects lack the context
>          * required to determine object type.
>          */
>         while ((oid = oidset_iter_next(&iter)))
>             add_missing_object_entry(oid, 0);
> 
> > +
> >                 oidset_clear(&revs.missing_commits);
> >         }
> >
> > @@ -801,12 +836,27 @@ int cmd_rev_list(int argc,
> >                 oidset_clear(&omitted_objects);
> >         }
> >         if (arg_missing_action == MA_PRINT) {
> > -               struct oidset_iter iter;
> > -               struct object_id *oid;
> > -               oidset_iter_init(&missing_objects, &iter);
> > -               while ((oid = oidset_iter_next(&iter)))
> > -                       printf("?%s\n", oid_to_hex(oid));
> > -               oidset_clear(&missing_objects);
> > +               struct missing_objects_map_entry *entry;
> > +               struct oidmap_iter iter;
> > +
> > +               oidmap_iter_init(&missing_objects, &iter);
> > +
> > +               while ((entry = oidmap_iter_next(&iter)))
> > +                       printf("?%s\n", oid_to_hex(&entry->entry.oid));
> > +
> > +               oidmap_free(&missing_objects, true);
> > +       }
> > +       if (arg_missing_action == MA_PRINT_TYPE) {
> > +               struct missing_objects_map_entry *entry;
> > +               struct oidmap_iter iter;
> > +
> > +               oidmap_iter_init(&missing_objects, &iter);
> > +
> > +               while ((entry = oidmap_iter_next(&iter)))
> > +                       printf("?%s %s\n", oid_to_hex(&entry->entry.oid),
> > +                              entry->type ? type_name(entry->type) : "");
> > +
> > +               oidmap_free(&missing_objects, true);
> >         }
> 
> Maybe a function like:
> 
> static void print_missing_objects(unsigned int with_type)
> {
>     struct missing_objects_map_entry *entry;
>     struct oidmap_iter iter;
> 
>     oidmap_iter_init(&missing_objects, &iter);
> 
>     while ((entry = oidmap_iter_next(&iter)))
>         if (with_type && entry->type)
>             printf("?%s %s\n", oid_to_hex(&entry->entry.oid),
>                    type_name(entry->type));
>         else
>             printf("?%s\n", oid_to_hex(&entry->entry.oid));
> 
>     oidmap_free(&missing_objects, true);
> }
> 
> could avoid some code duplication. It could also make the output a bit
> cleaner as when there is no type, there would be no space after the
> oid in the output.

I agree with this suggestion and will also factor this out in the next
version. Thanks!

-Justin
Junio C Hamano Jan. 8, 2025, 10:43 p.m. UTC | #5
Justin Tobler <jltobler@gmail.com> writes:

>> As I suspect that we would want to leave the door open for us to
>> extend this later, I would perhaps suggest an output format format
>> like:
>> 
>>     ?<object name> [<token>=<value>]...
>
> I think this is a great idea. To select which attributes get printed
> with the missing object we could add an option. Something like:
>
>   $ git rev-list --objects --missing=print \
>   --missing-attr=path --missing-attr=type

My knee-jerk reaction is that this is over-engineered; wouldn't it
be possible for us to simply dump everything we know about the
object, and let the receiving end pick and choose?

> I like the idea of also adding a path attribute, but this raises a
> couple of questions. The way `--missing=print` currently works is that
> it prints the unique set of missing object IDs. A missing object could
> possibly be referenced by multiple trees and thus have multiple valid
> paths.

That is not an issue at all, I think.  "rev-list --objects" that
shows objects that are not missing already has the same issue, and
the solution is "show the path when the object gets shown for the
first time".  Even when the same object is encountered during the
history-and-then-tree walk later, that object is simply not listed.

The code path that collects "I thought this blob should exist
because a tree wants to see it at this path, but the repository is
corrupt and I cannot see it there" into the missing object table
with attributes should do the same.  If the table does not yet have
the object, record the attributes (like "expected type", "path at
which the object was found") when inserting the object into the
table for the first time.  If you have a missing object and the
table already has it recorded there, don't do anything extra.
Justin Tobler Jan. 8, 2025, 11:13 p.m. UTC | #6
On 25/01/08 02:43PM, Junio C Hamano wrote:
> Justin Tobler <jltobler@gmail.com> writes:
> 
> >> As I suspect that we would want to leave the door open for us to
> >> extend this later, I would perhaps suggest an output format format
> >> like:
> >> 
> >>     ?<object name> [<token>=<value>]...
> >
> > I think this is a great idea. To select which attributes get printed
> > with the missing object we could add an option. Something like:
> >
> >   $ git rev-list --objects --missing=print \
> >   --missing-attr=path --missing-attr=type
> 
> My knee-jerk reaction is that this is over-engineered; wouldn't it
> be possible for us to simply dump everything we know about the
> object, and let the receiving end pick and choose?

I think that should also be fine and much simpler. We may not want to
effect the existing output of `--missing=print` though, so we could have
a simple boolean option like `--missing-attr` or just add a new missing
action type like `--missing=print-attr`. When enabled it would just
print all the identified attributes.

If we don't care though, we could keep it very simple and just add all
the information whenever `--missing=print` is set. 

> > I like the idea of also adding a path attribute, but this raises a
> > couple of questions. The way `--missing=print` currently works is that
> > it prints the unique set of missing object IDs. A missing object could
> > possibly be referenced by multiple trees and thus have multiple valid
> > paths.
> 
> That is not an issue at all, I think.  "rev-list --objects" that
> shows objects that are not missing already has the same issue, and
> the solution is "show the path when the object gets shown for the
> first time".  Even when the same object is encountered during the
> history-and-then-tree walk later, that object is simply not listed.
> 
> The code path that collects "I thought this blob should exist
> because a tree wants to see it at this path, but the repository is
> corrupt and I cannot see it there" into the missing object table
> with attributes should do the same.  If the table does not yet have
> the object, record the attributes (like "expected type", "path at
> which the object was found") when inserting the object into the
> table for the first time.  If you have a missing object and the
> table already has it recorded there, don't do anything extra.

Good to know! That makes things a bit simpler. I'll follow this approach
and only add the missing object to the table if an object with the same
OID is not already recorded.

-Justin
diff mbox series

Patch

diff --git a/Documentation/rev-list-options.txt b/Documentation/rev-list-options.txt
index 459e5a02f5..277a0b645e 100644
--- a/Documentation/rev-list-options.txt
+++ b/Documentation/rev-list-options.txt
@@ -1024,6 +1024,9 @@  Unexpected missing objects will raise an error.
 The form '--missing=print' is like 'allow-any', but will also print a
 list of the missing objects.  Object IDs are prefixed with a ``?'' character.
 +
+The form '--missing=print-type' is like 'print', but will also print the
+missing object type information if available in the form `?<oid> [type]`.
++
 If some tips passed to the traversal are missing, they will be
 considered as missing too, and the traversal will ignore them. In case
 we cannot get their Object ID though, an error will be raised.
diff --git a/builtin/rev-list.c b/builtin/rev-list.c
index 3196da7b2d..ee473da52c 100644
--- a/builtin/rev-list.c
+++ b/builtin/rev-list.c
@@ -22,6 +22,7 @@ 
 #include "progress.h"
 #include "reflog-walk.h"
 #include "oidset.h"
+#include "oidmap.h"
 #include "packfile.h"
 
 static const char rev_list_usage[] =
@@ -73,11 +74,16 @@  static unsigned progress_counter;
 static struct oidset omitted_objects;
 static int arg_print_omitted; /* print objects omitted by filter */
 
-static struct oidset missing_objects;
+struct missing_objects_map_entry {
+	struct oidmap_entry entry;
+	unsigned type : TYPE_BITS;
+};
+static struct oidmap missing_objects;
 enum missing_action {
 	MA_ERROR = 0,    /* fail if any missing objects are encountered */
 	MA_ALLOW_ANY,    /* silently allow ALL missing objects */
 	MA_PRINT,        /* print ALL missing objects in special section */
+	MA_PRINT_TYPE,   /* same as MA_PRINT but includes available type info */
 	MA_ALLOW_PROMISOR, /* silently allow all missing PROMISOR objects */
 };
 static enum missing_action arg_missing_action;
@@ -103,6 +109,8 @@  static off_t get_object_disk_usage(struct object *obj)
 
 static inline void finish_object__ma(struct object *obj)
 {
+	struct missing_objects_map_entry *entry, *old;
+
 	/*
 	 * Whether or not we try to dynamically fetch missing objects
 	 * from the server, we currently DO NOT have the object.  We
@@ -119,7 +127,12 @@  static inline void finish_object__ma(struct object *obj)
 		return;
 
 	case MA_PRINT:
-		oidset_insert(&missing_objects, &obj->oid);
+	case MA_PRINT_TYPE:
+		CALLOC_ARRAY(entry, 1);
+		entry->entry.oid = obj->oid;
+		entry->type = obj->type;
+		old = oidmap_put(&missing_objects, entry);
+		free(old);
 		return;
 
 	case MA_ALLOW_PROMISOR:
@@ -414,6 +427,12 @@  static inline int parse_missing_action_value(const char *value)
 		return 1;
 	}
 
+	if (!strcmp(value, "print-type")) {
+		arg_missing_action = MA_PRINT_TYPE;
+		fetch_if_missing = 0;
+		return 1;
+	}
+
 	if (!strcmp(value, "allow-promisor")) {
 		arg_missing_action = MA_ALLOW_PROMISOR;
 		fetch_if_missing = 0;
@@ -781,10 +800,26 @@  int cmd_rev_list(int argc,
 
 	if (arg_print_omitted)
 		oidset_init(&omitted_objects, DEFAULT_OIDSET_SIZE);
-	if (arg_missing_action == MA_PRINT) {
-		oidset_init(&missing_objects, DEFAULT_OIDSET_SIZE);
-		/* Add missing tips */
-		oidset_insert_from_set(&missing_objects, &revs.missing_commits);
+	if (arg_missing_action == MA_PRINT ||
+	    arg_missing_action == MA_PRINT_TYPE) {
+		struct oidset_iter iter;
+		struct object_id *oid;
+
+		oidmap_init(&missing_objects, DEFAULT_OIDSET_SIZE);
+		oidset_iter_init(&revs.missing_commits, &iter);
+
+		/*
+		 * Revisions pointing to missing objects lack the context
+		 * required to determine object type.
+		 */
+		while ((oid = oidset_iter_next(&iter))) {
+			struct missing_objects_map_entry *entry;
+
+			CALLOC_ARRAY(entry, 1);
+			oidcpy(&entry->entry.oid, oid);
+			oidmap_put(&missing_objects, entry);
+		}
+
 		oidset_clear(&revs.missing_commits);
 	}
 
@@ -801,12 +836,27 @@  int cmd_rev_list(int argc,
 		oidset_clear(&omitted_objects);
 	}
 	if (arg_missing_action == MA_PRINT) {
-		struct oidset_iter iter;
-		struct object_id *oid;
-		oidset_iter_init(&missing_objects, &iter);
-		while ((oid = oidset_iter_next(&iter)))
-			printf("?%s\n", oid_to_hex(oid));
-		oidset_clear(&missing_objects);
+		struct missing_objects_map_entry *entry;
+		struct oidmap_iter iter;
+
+		oidmap_iter_init(&missing_objects, &iter);
+
+		while ((entry = oidmap_iter_next(&iter)))
+			printf("?%s\n", oid_to_hex(&entry->entry.oid));
+
+		oidmap_free(&missing_objects, true);
+	}
+	if (arg_missing_action == MA_PRINT_TYPE) {
+		struct missing_objects_map_entry *entry;
+		struct oidmap_iter iter;
+
+		oidmap_iter_init(&missing_objects, &iter);
+
+		while ((entry = oidmap_iter_next(&iter)))
+			printf("?%s %s\n", oid_to_hex(&entry->entry.oid),
+			       entry->type ? type_name(entry->type) : "");
+
+		oidmap_free(&missing_objects, true);
 	}
 
 	stop_progress(&progress);
diff --git a/t/t6022-rev-list-missing.sh b/t/t6022-rev-list-missing.sh
index 7553a9cca2..1606082d4b 100755
--- a/t/t6022-rev-list-missing.sh
+++ b/t/t6022-rev-list-missing.sh
@@ -37,7 +37,7 @@  done
 
 for obj in "HEAD~1" "HEAD~1^{tree}" "HEAD:1.t"
 do
-	for action in "allow-any" "print"
+	for action in "allow-any" "print" "print-type"
 	do
 		test_expect_success "rev-list --missing=$action with missing $obj" '
 			oid="$(git rev-parse $obj)" &&
@@ -48,6 +48,13 @@  do
 			git rev-list --objects --no-object-names \
 				HEAD ^$obj >expect.raw &&
 
+			# When the action is to print the type, we should also
+			# record the expected type of the missing object.
+			if test "$action" = "print-type"
+			then
+				type="$(git cat-file -t $obj)"
+			fi &&
+
 			# Blobs are shared by all commits, so even though a commit/tree
 			# might be skipped, its blob must be accounted for.
 			if test $obj != "HEAD:1.t"
@@ -71,6 +78,10 @@  do
 				grep ?$oid actual.raw &&
 				echo ?$oid >>expect.raw
 				;;
+			print-type)
+				grep "?$oid $type" actual.raw &&
+				echo "?$oid $type" >>expect.raw
+				;;
 			esac &&
 
 			sort actual.raw >actual &&