diff mbox series

[v4,1/5] object-file: refactor write_loose_object() to read buffer from stream

Message ID 20211203093530.93589-2-chiyutianyi@gmail.com (mailing list archive)
State Superseded
Headers show
Series unpack large objects in stream | expand

Commit Message

Han Xin Dec. 3, 2021, 9:35 a.m. UTC
From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "write_loose_object()" in a
stream. The input stream is implemented as an interface. In the first
step, we make a simple implementation, feeding the entire buffer in the
"stream" to "write_loose_object()" as a refactor.

Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
 object-store.h |  6 ++++++
 2 files changed, 55 insertions(+), 4 deletions(-)

Comments

Ævar Arnfjörð Bjarmason Dec. 3, 2021, 1:28 p.m. UTC | #1
On Fri, Dec 03 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> This can be improved by feeding data to "write_loose_object()" in a
> stream. The input stream is implemented as an interface. In the first
> step, we make a simple implementation, feeding the entire buffer in the
> "stream" to "write_loose_object()" as a refactor.
>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
>  object-store.h |  6 ++++++
>  2 files changed, 55 insertions(+), 4 deletions(-)
>
> diff --git a/object-file.c b/object-file.c
> index eb972cdccd..82656f7428 100644
> --- a/object-file.c
> +++ b/object-file.c
> @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  	return fd;
>  }
>  
> +struct simple_input_stream_data {
> +	const void *buf;
> +	unsigned long len;
> +};

I see why you picked "const void *buf" here, over say const char *, it's
what "struct input_stream" uses.

But why not use size_t for the length, as input_stream does?

> +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> +{
> +	struct simple_input_stream_data *data = in_stream->data;
> +
> +	if (data->len == 0) {

nit: if (!data->len)...

> +		*len = 0;
> +		return NULL;
> +	}
> +	*len = data->len;
> +	data->len = 0;
> +	return data->buf;

But isn't the body of this function the same as:

        *len = data->len;
        if (!len)
                return NULL;
        data->len = 0;
        return data->buf;

I.e. you don't need the condition for setting "*len": if it's 0, then
data->len is also 0. You just want to return NULL afterwards, and not
set data->len to 0 (harmless, but no need) or return data->buf.
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

Maybe it's that I'm unused to it, but I find this a bit more readable:
	
	@@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
	 {
	 	char hdr[MAX_HEADER_LEN];
	 	int hdrlen = sizeof(hdr);
	+	struct simple_input_stream_data tmp = {
	+		.buf = buf,
	+		.len = len,
	+	};
	 	struct input_stream in_stream = {
	 		.read = feed_simple_input_stream,
	-		.data = (void *)&(struct simple_input_stream_data) {
	-			.buf = buf,
	-			.len = len,
	-		},
	+		.data = (void *)&tmp,
	 		.size = len,
	 	};
	
Yes, there's a temporary variable, but no dense inline casting. It's also
easier to step through in a debugger (which will have the type
information on "tmp").

>  int hash_object_file_literally(const void *buf, unsigned long len,
> @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  {
>  	char *header;
>  	int hdrlen, status = 0;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = (void *)&(struct simple_input_stream_data) {
> +			.buf = buf,
> +			.len = len,
> +		},
> +		.size = len,
> +	};

ditto..

>  	/* type string, SP, %lu of the length plus NUL must fit this */
>  	hdrlen = strlen(type) + MAX_HEADER_LEN;
> @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
>  		goto cleanup;
>  	if (freshen_packed_object(oid) || freshen_loose_object(oid))
>  		goto cleanup;
> -	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> +	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
>  
>  cleanup:
>  	free(header);
> @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
>  	char hdr[MAX_HEADER_LEN];
>  	int hdrlen;
>  	int ret;
> +	struct simple_input_stream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_simple_input_stream,
> +		.data = &data,
> +	};
>  
>  	if (has_loose_object(oid))
>  		return 0;
>  	buf = read_object(the_repository, oid, &type, &len);
> +	in_stream.size = len;

Why are we setting this here?...

>  	if (!buf)
>  		return error(_("cannot read object for %s"), oid_to_hex(oid));

...Instead of after this point, as we may error and never use it?

> +	data.buf = buf;
> +	data.len = len;

Probably won't matter,  just a nit...

> +struct input_stream {
> +	const void *(*read)(struct input_stream *, unsigned long *len);
> +	void *data;
> +	size_t size;
> +};
> +

Ah, and here's the size_t... :)
Han Xin Dec. 6, 2021, 2:07 a.m. UTC | #2
On Fri, Dec 3, 2021 at 9:41 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
>
> On Fri, Dec 03 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> > entire contents of a blob object, no matter how big it is. This
> > implementation may consume all the memory and cause OOM.
> >
> > This can be improved by feeding data to "write_loose_object()" in a
> > stream. The input stream is implemented as an interface. In the first
> > step, we make a simple implementation, feeding the entire buffer in the
> > "stream" to "write_loose_object()" as a refactor.
> >
> > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> > ---
> >  object-file.c  | 53 ++++++++++++++++++++++++++++++++++++++++++++++----
> >  object-store.h |  6 ++++++
> >  2 files changed, 55 insertions(+), 4 deletions(-)
> >
> > diff --git a/object-file.c b/object-file.c
> > index eb972cdccd..82656f7428 100644
> > --- a/object-file.c
> > +++ b/object-file.c
> > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >       return fd;
> >  }
> >
> > +struct simple_input_stream_data {
> > +     const void *buf;
> > +     unsigned long len;
> > +};
>
> I see why you picked "const void *buf" here, over say const char *, it's
> what "struct input_stream" uses.
>
> But why not use size_t for the length, as input_stream does?
>

Yes, "size_t" will be better here.

> > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
> > +{
> > +     struct simple_input_stream_data *data = in_stream->data;
> > +
> > +     if (data->len == 0) {
>
> nit: if (!data->len)...
>

Will apply.

> > +             *len = 0;
> > +             return NULL;
> > +     }
> > +     *len = data->len;
> > +     data->len = 0;
> > +     return data->buf;
>
> But isn't the body of this function the same as:
>
>         *len = data->len;
>         if (!len)
>                 return NULL;
>         data->len = 0;
>         return data->buf;
>
> I.e. you don't need the condition for setting "*len": if it's 0, then
> data->len is also 0. You just want to return NULL afterwards, and not
> set data->len to 0 (harmless, but no need) or return data->buf.

Will apply.

> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> Maybe it's that I'm unused to it, but I find this a bit more readable:
>
>         @@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
>          {
>                 char hdr[MAX_HEADER_LEN];
>                 int hdrlen = sizeof(hdr);
>         +       struct simple_input_stream_data tmp = {
>         +               .buf = buf,
>         +               .len = len,
>         +       };
>                 struct input_stream in_stream = {
>                         .read = feed_simple_input_stream,
>         -               .data = (void *)&(struct simple_input_stream_data) {
>         -                       .buf = buf,
>         -                       .len = len,
>         -               },
>         +               .data = (void *)&tmp,
>                         .size = len,
>                 };
>
> Yes, there's a temporary variable, but no dense inline casting. It's also
> easier to step through in a debugger (which will have the type
> information on "tmp").
>

Will apply.

> >  int hash_object_file_literally(const void *buf, unsigned long len,
> > @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >  {
> >       char *header;
> >       int hdrlen, status = 0;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = (void *)&(struct simple_input_stream_data) {
> > +                     .buf = buf,
> > +                     .len = len,
> > +             },
> > +             .size = len,
> > +     };
>
> ditto..
>
> >       /* type string, SP, %lu of the length plus NUL must fit this */
> >       hdrlen = strlen(type) + MAX_HEADER_LEN;
> > @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len,
> >               goto cleanup;
> >       if (freshen_packed_object(oid) || freshen_loose_object(oid))
> >               goto cleanup;
> > -     status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
> > +     status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
> >
> >  cleanup:
> >       free(header);
> > @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime)
> >       char hdr[MAX_HEADER_LEN];
> >       int hdrlen;
> >       int ret;
> > +     struct simple_input_stream_data data;
> > +     struct input_stream in_stream = {
> > +             .read = feed_simple_input_stream,
> > +             .data = &data,
> > +     };
> >
> >       if (has_loose_object(oid))
> >               return 0;
> >       buf = read_object(the_repository, oid, &type, &len);
> > +     in_stream.size = len;
>
> Why are we setting this here?...
>

Yes, putting "in_stream.size=len;" here was a stupid decision.

> >       if (!buf)
> >               return error(_("cannot read object for %s"), oid_to_hex(oid));
>
> ...Instead of after this point, as we may error and never use it?
>
> > +     data.buf = buf;
> > +     data.len = len;
>
> Probably won't matter,  just a nit...
>
> > +struct input_stream {
> > +     const void *(*read)(struct input_stream *, unsigned long *len);
> > +     void *data;
> > +     size_t size;
> > +};
> > +
>
> Ah, and here's the size_t... :)
diff mbox series

Patch

diff --git a/object-file.c b/object-file.c
index eb972cdccd..82656f7428 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1860,8 +1860,26 @@  static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	return fd;
 }
 
+struct simple_input_stream_data {
+	const void *buf;
+	unsigned long len;
+};
+
+static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len)
+{
+	struct simple_input_stream_data *data = in_stream->data;
+
+	if (data->len == 0) {
+		*len = 0;
+		return NULL;
+	}
+	*len = data->len;
+	data->len = 0;
+	return data->buf;
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
-			      int hdrlen, const void *buf, unsigned long len,
+			      int hdrlen, struct input_stream *in_stream,
 			      time_t mtime, unsigned flags)
 {
 	int fd, ret;
@@ -1871,6 +1889,8 @@  static int write_loose_object(const struct object_id *oid, char *hdr,
 	struct object_id parano_oid;
 	static struct strbuf tmp_file = STRBUF_INIT;
 	static struct strbuf filename = STRBUF_INIT;
+	const void *buf;
+	unsigned long len;
 
 	loose_object_path(the_repository, &filename, oid);
 
@@ -1898,6 +1918,7 @@  static int write_loose_object(const struct object_id *oid, char *hdr,
 	the_hash_algo->update_fn(&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
+	buf = in_stream->read(in_stream, &len);
 	stream.next_in = (void *)buf;
 	stream.avail_in = len;
 	do {
@@ -1960,6 +1981,14 @@  int write_object_file_flags(const void *buf, unsigned long len,
 {
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen = sizeof(hdr);
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -1968,7 +1997,7 @@  int write_object_file_flags(const void *buf, unsigned long len,
 				  &hdrlen);
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		return 0;
-	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
+	return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags);
 }
 
 int hash_object_file_literally(const void *buf, unsigned long len,
@@ -1977,6 +2006,14 @@  int hash_object_file_literally(const void *buf, unsigned long len,
 {
 	char *header;
 	int hdrlen, status = 0;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = (void *)&(struct simple_input_stream_data) {
+			.buf = buf,
+			.len = len,
+		},
+		.size = len,
+	};
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
@@ -1988,7 +2025,7 @@  int hash_object_file_literally(const void *buf, unsigned long len,
 		goto cleanup;
 	if (freshen_packed_object(oid) || freshen_loose_object(oid))
 		goto cleanup;
-	status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0);
+	status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0);
 
 cleanup:
 	free(header);
@@ -2003,14 +2040,22 @@  int force_object_loose(const struct object_id *oid, time_t mtime)
 	char hdr[MAX_HEADER_LEN];
 	int hdrlen;
 	int ret;
+	struct simple_input_stream_data data;
+	struct input_stream in_stream = {
+		.read = feed_simple_input_stream,
+		.data = &data,
+	};
 
 	if (has_loose_object(oid))
 		return 0;
 	buf = read_object(the_repository, oid, &type, &len);
+	in_stream.size = len;
 	if (!buf)
 		return error(_("cannot read object for %s"), oid_to_hex(oid));
+	data.buf = buf;
+	data.len = len;
 	hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1;
-	ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0);
+	ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0);
 	free(buf);
 
 	return ret;
diff --git a/object-store.h b/object-store.h
index 952efb6a4b..a84d891d60 100644
--- a/object-store.h
+++ b/object-store.h
@@ -34,6 +34,12 @@  struct object_directory {
 	char *path;
 };
 
+struct input_stream {
+	const void *(*read)(struct input_stream *, unsigned long *len);
+	void *data;
+	size_t size;
+};
+
 KHASH_INIT(odb_path_map, const char * /* key: odb_path */,
 	struct object_directory *, 1, fspathhash, fspatheq)