
[v8,1/6] unpack-objects: low memory footprint for get_data() in dry_run mode

Message ID 20220108085419.79682-2-chiyutianyi@gmail.com (mailing list archive)
State New, archived
Series unpack large blobs in stream

Commit Message

Han Xin Jan. 8, 2022, 8:54 a.m. UTC
From: Han Xin <hanxin.hx@alibaba-inc.com>

As the name implies, "get_data(size)" will allocate and return a buffer
of the given size. Allocating memory for a large blob object may cause
the system to run out of memory. In preparation for replacing the call
to "get_data()" that unpacks large blob objects in later commits,
refactor "get_data()" to reduce its memory footprint in dry_run mode.

Because in dry_run mode, "get_data()" is only used to check the
integrity of data, and the returned buffer is not used at all, we can
allocate a smaller buffer and reuse it as zstream output. Therefore,
in dry_run mode, "get_data()" will release the allocated buffer and
return NULL instead of returning garbage data.

Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 builtin/unpack-objects.c        | 39 ++++++++++++++++++-------
 t/t5329-unpack-large-objects.sh | 52 +++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 11 deletions(-)
 create mode 100755 t/t5329-unpack-large-objects.sh
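
To see the effect, here is a minimal usage sketch (the repository and pack
names are illustrative; it mirrors the test added below). GIT_ALLOC_LIMIT
caps individual allocations, so without this patch a dry-run unpack of a
pack containing blobs larger than the limit dies with "fatal: attempting to
allocate ...":

	git init --bare dest.git
	# big.pack is a placeholder for a pack that contains large blobs
	GIT_ALLOC_LIMIT=1m git -C dest.git unpack-objects -n <big.pack

With this patch applied, the dry run succeeds because only a small (8 kB)
buffer is allocated and reused, regardless of the object size.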

Comments

René Scharfe Jan. 8, 2022, 12:28 p.m. UTC | #1
On 08.01.22 at 09:54, Han Xin wrote:
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> As the name implies, "get_data(size)" will allocate and return a buffer
> of the given size. Allocating memory for a large blob object may cause
> the system to run out of memory. In preparation for replacing the call
> to "get_data()" that unpacks large blob objects in later commits,
> refactor "get_data()" to reduce its memory footprint in dry_run mode.
>
> Because in dry_run mode "get_data()" is only used to check the
> integrity of the data and the returned buffer is not used at all, we
> can allocate a smaller buffer and reuse it as the zstream output.
> Therefore, in dry_run mode, "get_data()" will release the allocated
> buffer and return NULL instead of returning garbage data.
>
> Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  builtin/unpack-objects.c        | 39 ++++++++++++++++++-------
>  t/t5329-unpack-large-objects.sh | 52 +++++++++++++++++++++++++++++++++
>  2 files changed, 80 insertions(+), 11 deletions(-)
>  create mode 100755 t/t5329-unpack-large-objects.sh
>
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 4a9466295b..c6d6c17072 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -96,15 +96,31 @@ static void use(int bytes)
>  	display_throughput(progress, consumed_bytes);
>  }
>
> +/*
> + * Decompress a zstream from stdin and return the specified size of data.
> + * The caller is responsible for freeing the returned buffer.
> + *
> + * In dry_run mode, however, "get_data()" is only used to check the
> + * integrity of the data, and the returned buffer is not used at all.
> + * Therefore, in dry_run mode, "get_data()" releases the small
> + * allocated buffer, which is reused to hold temporary zstream output,
> + * and returns NULL instead of returning garbage data.
> + */
>  static void *get_data(unsigned long size)
>  {
>  	git_zstream stream;
> -	void *buf = xmallocz(size);
> +	unsigned long bufsize;
> +	void *buf;
>
>  	memset(&stream, 0, sizeof(stream));
> +	if (dry_run && size > 8192)
> +		bufsize = 8192;
> +	else
> +		bufsize = size;
> +	buf = xmallocz(bufsize);
>
>  	stream.next_out = buf;
> -	stream.avail_out = size;
> +	stream.avail_out = bufsize;
>  	stream.next_in = fill(1);
>  	stream.avail_in = len;
>  	git_inflate_init(&stream);
> @@ -124,8 +140,15 @@ static void *get_data(unsigned long size)
>  		}
>  		stream.next_in = fill(1);
>  		stream.avail_in = len;
> +		if (dry_run) {
> +			/* reuse the buffer in dry_run mode */
> +			stream.next_out = buf;
> +			stream.avail_out = bufsize;
> +		}
>  	}
>  	git_inflate_end(&stream);
> +	if (dry_run)
> +		FREE_AND_NULL(buf);
>  	return buf;
>  }
>
> @@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
>  {
>  	void *buf = get_data(size);
>
> -	if (!dry_run && buf)
> +	if (buf)
>  		write_object(nr, type, buf, size);
> -	else
> -		free(buf);
>  }
>
>  static int resolve_against_held(unsigned nr, const struct object_id *base,
> @@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  		oidread(&base_oid, fill(the_hash_algo->rawsz));
>  		use(the_hash_algo->rawsz);
>  		delta_data = get_data(delta_size);
> -		if (dry_run || !delta_data) {
> -			free(delta_data);
> +		if (!delta_data)
>  			return;
> -		}
>  		if (has_object_file(&base_oid))
>  			; /* Ok we have this one */
>  		else if (resolve_against_held(nr, &base_oid,
> @@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
>  			die("offset value out of bound for delta base object");
>
>  		delta_data = get_data(delta_size);
> -		if (dry_run || !delta_data) {
> -			free(delta_data);
> +		if (!delta_data)
>  			return;
> -		}
>  		lo = 0;
>  		hi = nr;
>  		while (lo < hi) {

Nice!

> diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
> new file mode 100755
> index 0000000000..39c7a62d94
> --- /dev/null
> +++ b/t/t5329-unpack-large-objects.sh
> @@ -0,0 +1,52 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2021 Han Xin
> +#
> +
> +test_description='git unpack-objects with large objects'
> +
> +. ./test-lib.sh
> +
> +prepare_dest () {
> +	test_when_finished "rm -rf dest.git" &&
> +	git init --bare dest.git
> +}
> +
> +assert_no_loose () {
> +	glob=dest.git/objects/?? &&
> +	echo "$glob" >expect &&
> +	eval "echo $glob" >actual &&
> +	test_cmp expect actual
> +}
> +
> +assert_no_pack () {
> +	rmdir dest.git/objects/pack

I would expect a function whose name starts with "assert" to have no
side effects.  It doesn't matter here, because it's called only at the
very end, but that might change.  You can use test_dir_is_empty instead
of rmdir.
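
A minimal sketch of that suggestion (keeping the current function name just
for illustration); test_dir_is_empty only inspects the directory and has no
side effects:

	assert_no_pack () {
		# checks dest.git/objects/pack without removing it
		test_dir_is_empty dest.git/objects/pack
	}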

> +}
> +
> +test_expect_success "create large objects (1.5 MB) and PACK" '
> +	test-tool genrandom foo 1500000 >big-blob &&
> +	test_commit --append foo big-blob &&
> +	test-tool genrandom bar 1500000 >big-blob &&
> +	test_commit --append bar big-blob &&
> +	PACK=$(echo HEAD | git pack-objects --revs test)
> +'
> +
> +test_expect_success 'set memory limitation to 1MB' '
> +	GIT_ALLOC_LIMIT=1m &&
> +	export GIT_ALLOC_LIMIT
> +'
> +
> +test_expect_success 'unpack-objects failed under memory limitation' '
> +	prepare_dest &&
> +	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
> +	grep "fatal: attempting to allocate" err
> +'
> +
> +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
> +	prepare_dest &&
> +	git -C dest.git unpack-objects -n <test-$PACK.pack &&
> +	assert_no_loose &&
> +	assert_no_pack
> +'
> +
> +test_done
Han Xin Jan. 11, 2022, 10:41 a.m. UTC | #2
On Sat, Jan 8, 2022 at 8:28 PM René Scharfe <l.s.r@web.de> wrote:
>
>  On 08.01.22 at 09:54, Han Xin wrote:
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> >
> > +assert_no_loose () {
> > +     glob=dest.git/objects/?? &&
> > +     echo "$glob" >expect &&
> > +     eval "echo $glob" >actual &&
> > +     test_cmp expect actual
> > +}
> > +
> > +assert_no_pack () {
> > +     rmdir dest.git/objects/pack
>
> I would expect a function whose name starts with "assert" to have no
> side effects.  It doesn't matter here, because it's called only at the
> very end, but that might change.  You can use test_dir_is_empty instead
> of rmdir.
>

*nod*
I think it would be better to rename "assert_no_loose()" to "test_no_loose()".
I will remove "assert_no_pack()" and use "test_dir_is_empty()" instead.

Thanks
-Han Xin
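
Based on that reply, a sketch of how the helpers and the dry-run test might
look in the next version (names are tentative; this is not the final reroll):

	test_no_loose () {
		glob=dest.git/objects/?? &&
		echo "$glob" >expect &&
		eval "echo $glob" >actual &&
		test_cmp expect actual
	}

	test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
		prepare_dest &&
		git -C dest.git unpack-objects -n <test-$PACK.pack &&
		test_no_loose &&
		# no wrapper: call test_dir_is_empty directly, as noted above
		test_dir_is_empty dest.git/objects/pack
	'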

Patch

diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 4a9466295b..c6d6c17072 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -96,15 +96,31 @@  static void use(int bytes)
 	display_throughput(progress, consumed_bytes);
 }
 
+/*
+ * Decompress a zstream from stdin and return the specified size of data.
+ * The caller is responsible for freeing the returned buffer.
+ *
+ * In dry_run mode, however, "get_data()" is only used to check the
+ * integrity of the data, and the returned buffer is not used at all.
+ * Therefore, in dry_run mode, "get_data()" releases the small
+ * allocated buffer, which is reused to hold temporary zstream output,
+ * and returns NULL instead of returning garbage data.
+ */
 static void *get_data(unsigned long size)
 {
 	git_zstream stream;
-	void *buf = xmallocz(size);
+	unsigned long bufsize;
+	void *buf;
 
 	memset(&stream, 0, sizeof(stream));
+	if (dry_run && size > 8192)
+		bufsize = 8192;
+	else
+		bufsize = size;
+	buf = xmallocz(bufsize);
 
 	stream.next_out = buf;
-	stream.avail_out = size;
+	stream.avail_out = bufsize;
 	stream.next_in = fill(1);
 	stream.avail_in = len;
 	git_inflate_init(&stream);
@@ -124,8 +140,15 @@  static void *get_data(unsigned long size)
 		}
 		stream.next_in = fill(1);
 		stream.avail_in = len;
+		if (dry_run) {
+			/* reuse the buffer in dry_run mode */
+			stream.next_out = buf;
+			stream.avail_out = bufsize;
+		}
 	}
 	git_inflate_end(&stream);
+	if (dry_run)
+		FREE_AND_NULL(buf);
 	return buf;
 }
 
@@ -325,10 +348,8 @@  static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 {
 	void *buf = get_data(size);
 
-	if (!dry_run && buf)
+	if (buf)
 		write_object(nr, type, buf, size);
-	else
-		free(buf);
 }
 
 static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -358,10 +379,8 @@  static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 		oidread(&base_oid, fill(the_hash_algo->rawsz));
 		use(the_hash_algo->rawsz);
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		if (has_object_file(&base_oid))
 			; /* Ok we have this one */
 		else if (resolve_against_held(nr, &base_oid,
@@ -397,10 +416,8 @@  static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 			die("offset value out of bound for delta base object");
 
 		delta_data = get_data(delta_size);
-		if (dry_run || !delta_data) {
-			free(delta_data);
+		if (!delta_data)
 			return;
-		}
 		lo = 0;
 		hi = nr;
 		while (lo < hi) {
diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh
new file mode 100755
index 0000000000..39c7a62d94
--- /dev/null
+++ b/t/t5329-unpack-large-objects.sh
@@ -0,0 +1,52 @@ 
+#!/bin/sh
+#
+# Copyright (c) 2021 Han Xin
+#
+
+test_description='git unpack-objects with large objects'
+
+. ./test-lib.sh
+
+prepare_dest () {
+	test_when_finished "rm -rf dest.git" &&
+	git init --bare dest.git
+}
+
+assert_no_loose () {
+	glob=dest.git/objects/?? &&
+	echo "$glob" >expect &&
+	eval "echo $glob" >actual &&
+	test_cmp expect actual
+}
+
+assert_no_pack () {
+	rmdir dest.git/objects/pack
+}
+
+test_expect_success "create large objects (1.5 MB) and PACK" '
+	test-tool genrandom foo 1500000 >big-blob &&
+	test_commit --append foo big-blob &&
+	test-tool genrandom bar 1500000 >big-blob &&
+	test_commit --append bar big-blob &&
+	PACK=$(echo HEAD | git pack-objects --revs test)
+'
+
+test_expect_success 'set memory limitation to 1MB' '
+	GIT_ALLOC_LIMIT=1m &&
+	export GIT_ALLOC_LIMIT
+'
+
+test_expect_success 'unpack-objects failed under memory limitation' '
+	prepare_dest &&
+	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
+	grep "fatal: attempting to allocate" err
+'
+
+test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
+	prepare_dest &&
+	git -C dest.git unpack-objects -n <test-$PACK.pack &&
+	assert_no_loose &&
+	assert_no_pack
+'
+
+test_done