diff mbox series

[v7,5/5] unpack-objects: unpack_non_delta_entry() read data in a stream

Message ID 20211221115201.12120-6-chiyutianyi@gmail.com (mailing list archive)
State Superseded
Headers show
Series unpack large blobs in stream | expand

Commit Message

Han Xin Dec. 21, 2021, 11:52 a.m. UTC
From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

By implementing a zstream version of input_stream interface, we can use
a small fixed buffer for "unpack_non_delta_entry()".

However, unpack non-delta objects from a stream instead of from an
entrie buffer will have 10% performance penalty. Therefore, only unpack
object larger than the "core.BigFileStreamingThreshold" in zstream. See
the following benchmarks:

    hyperfine \
      --setup \
      'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
      --prepare 'rm -rf dest.git && git init --bare dest.git'

    Summary
      './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
        1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
        1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
        1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
        1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Helped-by: Derrick Stolee <stolee@gmail.com>
Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 Documentation/config/core.txt       | 11 +++++
 builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
 cache.h                             |  1 +
 config.c                            |  5 ++
 environment.c                       |  1 +
 t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
 6 files changed, 125 insertions(+), 2 deletions(-)

Comments

Ævar Arnfjörð Bjarmason Dec. 21, 2021, 3:06 p.m. UTC | #1
On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpack non-delta objects from a stream instead of from an
> entrie buffer will have 10% performance penalty. Therefore, only unpack
> object larger than the "core.BigFileStreamingThreshold" in zstream. See
> the following benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git'
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
>         1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Derrick Stolee <stolee@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  Documentation/config/core.txt       | 11 +++++
>  builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
>  cache.h                             |  1 +
>  config.c                            |  5 ++
>  environment.c                       |  1 +
>  t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
>  6 files changed, 125 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a..601b7a2418 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>  
> +core.bigFileStreamingThreshold::
> +	Files larger than this will be streamed out to a temporary
> +	object file while being hashed, which will when be renamed
> +	in-place to a loose object, particularly if the
> +	`core.bigFileThreshold' setting dictates that they're always
> +	written out as loose objects.
> ++
> +Default is 128 MiB on all platforms.
> ++
> +Common unit suffixes of 'k', 'm', or 'g' are supported.
> +
>  core.excludesFile::
>  	Specifies the pathname to the file that contains patterns to
>  	describe paths that are not meant to be tracked, in addition
> diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
> index 9104eb48da..72d8616e00 100644
> --- a/builtin/unpack-objects.c
> +++ b/builtin/unpack-objects.c
> @@ -331,11 +331,82 @@ static void added_object(unsigned nr, enum object_type type,
>  	}
>  }
>  
> +struct input_zstream_data {
> +	git_zstream *zstream;
> +	unsigned char buf[8192];
> +	int status;
> +};
> +
> +static const void *feed_input_zstream(struct input_stream *in_stream,
> +				      unsigned long *readlen)
> +{
> +	struct input_zstream_data *data = in_stream->data;
> +	git_zstream *zstream = data->zstream;
> +	void *in = fill(1);
> +
> +	if (!len || data->status == Z_STREAM_END) {
> +		*readlen = 0;
> +		return NULL;
> +	}
> +
> +	zstream->next_out = data->buf;
> +	zstream->avail_out = sizeof(data->buf);
> +	zstream->next_in = in;
> +	zstream->avail_in = len;
> +
> +	data->status = git_inflate(zstream, 0);
> +	use(len - zstream->avail_in);
> +	*readlen = sizeof(data->buf) - zstream->avail_out;
> +
> +	return data->buf;
> +}
> +
> +static void write_stream_blob(unsigned nr, size_t size)
> +{
> +	git_zstream zstream;
> +	struct input_zstream_data data;
> +	struct input_stream in_stream = {
> +		.read = feed_input_zstream,
> +		.data = &data,
> +	};
> +
> +	memset(&zstream, 0, sizeof(zstream));
> +	memset(&data, 0, sizeof(data));

nit/style: both of these memset can be replaced by "{ 0 }", e.g. "git_zstream zstream = { 0 }".

> +	data.zstream = &zstream;
> +	git_inflate_init(&zstream);
> +
> +	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
> +				     &obj_list[nr].oid))

So at the end of this series we never pass in anything but blob here,
mtime is always 0 etc. So there was no reason to create a factored out
finalize_object_file_with_mtime() earlier in the series.

Well, I don't mind the finalize_object_file_with_mtime() exiting, but
let's not pretend this is more generalized than it is. We're unlikely to
ever want to do this for non-blobs.

This on top of this series (and my local WIP fixups as I'm reviewing
this, so it won't cleanly apply, but the idea should be clear) makes
this simpler:
	
	diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
	index 2f8d34a2e47..a3a1d4b266f 100644
	--- a/builtin/unpack-objects.c
	+++ b/builtin/unpack-objects.c
	@@ -375,8 +375,7 @@ static void write_stream_blob(unsigned nr, size_t size)
	 	data.zstream = &zstream;
	 	git_inflate_init(&zstream);
	 
	-	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
	-				     &obj_list[nr].oid))
	+	if (write_stream_object_file(&in_stream, size, &obj_list[nr].oid))
	 		die(_("failed to write object in stream"));
	 
	 	if (zstream.total_out != size || data.status != Z_STREAM_END)
	diff --git a/object-file.c b/object-file.c
	index 7fc2363cfa1..0572b34fc5a 100644
	--- a/object-file.c
	+++ b/object-file.c
	@@ -2061,8 +2061,7 @@ static int freshen_packed_object(const struct object_id *oid)
	 }
	 
	 int write_stream_object_file(struct input_stream *in_stream, size_t len,
	-			     enum object_type type, time_t mtime,
	-			     unsigned flags, struct object_id *oid)
	+			     struct object_id *oid)
	 {
	 	int fd, ret, flush = 0;
	 	unsigned char compressed[4096];
	@@ -2081,9 +2080,9 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 	/* When oid is not determined, save tmp file to odb path. */
	 	strbuf_addf(&filename, "%s/", get_object_directory());
	 
	-	fd = start_loose_object_common(&tmp_file, filename.buf, flags,
	+	fd = start_loose_object_common(&tmp_file, filename.buf, 0,
	 				       &stream, compressed, sizeof(compressed),
	-				       &c, type, len, hdr, &hdrlen);
	+				       &c, OBJ_BLOB, len, hdr, &hdrlen);
	 	if (fd < 0)
	 		return -1;
	 
	@@ -2135,7 +2134,7 @@ int write_stream_object_file(struct input_stream *in_stream, size_t len,
	 		strbuf_release(&dir);
	 	}
	 
	-	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf, mtime, flags);
	+	return finalize_object_file(tmp_file.buf, filename.buf);
	 }
	 
	 int write_object_file_flags(const void *buf, unsigned long len,
	diff --git a/object-store.h b/object-store.h
	index 87d370d39ca..1362b58a4d3 100644
	--- a/object-store.h
	+++ b/object-store.h
	@@ -257,8 +257,7 @@ int hash_write_object_file_literally(const void *buf, unsigned long len,
	 				     unsigned flags);
	 
	 int write_stream_object_file(struct input_stream *in_stream, size_t len,
	-			     enum object_type type, time_t mtime,
	-			     unsigned flags, struct object_id *oid);
	+			     struct object_id *oid);
	 
	 /*
	  * Add an object file to the in-memory object store, without writing it
	

> +		die(_("failed to write object in stream"));
> diff --git a/environment.c b/environment.c
> index 0d06a31024..04bba593de 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -47,6 +47,7 @@ size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
>  size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
>  size_t delta_base_cache_limit = 96 * 1024 * 1024;
>  unsigned long big_file_threshold = 512 * 1024 * 1024;
> +unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
>  int pager_use_color = 1;
>  const char *editor_program;
>  const char *askpass_program;
> diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
> index 48c4fb1ba3..8436cbf8db 100755
> --- a/t/t5590-unpack-non-delta-objects.sh
> +++ b/t/t5590-unpack-non-delta-objects.sh
> @@ -13,6 +13,11 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
>  prepare_dest () {
>  	test_when_finished "rm -rf dest.git" &&
>  	git init --bare dest.git
> +	if test -n "$1"
> +	then
> +		git -C dest.git config core.bigFileStreamingThreshold $1
> +		git -C dest.git config core.bigFileThreshold $1
> +	fi

All of this new code is missing "&&" to chain & test forfailures.
Jiang Xin Dec. 31, 2021, 3:19 a.m. UTC | #2
On Wed, Dec 22, 2021 at 2:56 AM Han Xin <chiyutianyi@gmail.com> wrote:
>
> From: Han Xin <hanxin.hx@alibaba-inc.com>
>
> We used to call "get_data()" in "unpack_non_delta_entry()" to read the
> entire contents of a blob object, no matter how big it is. This
> implementation may consume all the memory and cause OOM.
>
> By implementing a zstream version of input_stream interface, we can use
> a small fixed buffer for "unpack_non_delta_entry()".
>
> However, unpack non-delta objects from a stream instead of from an
> entrie buffer will have 10% performance penalty. Therefore, only unpack
> object larger than the "core.BigFileStreamingThreshold" in zstream. See
> the following benchmarks:
>
>     hyperfine \
>       --setup \
>       'if ! test -d scalar.git; then git clone --bare https://github.com/microsoft/scalar.git; cp scalar.git/objects/pack/*.pack small.pack; fi' \
>       --prepare 'rm -rf dest.git && git init --bare dest.git'
>
>     Summary
>       './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'origin/master'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~1'
>         1.01 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=512m unpack-objects <small.pack' in 'HEAD~0'
>         1.03 ± 0.10 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'origin/master'
>         1.02 ± 0.07 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~0'
>         1.10 ± 0.04 times faster than './git -C dest.git -c core.bigfilethreshold=16k unpack-objects <small.pack' in 'HEAD~1'
>
> Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
> Helped-by: Derrick Stolee <stolee@gmail.com>
> Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com>
> Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
> ---
>  Documentation/config/core.txt       | 11 +++++
>  builtin/unpack-objects.c            | 73 ++++++++++++++++++++++++++++-
>  cache.h                             |  1 +
>  config.c                            |  5 ++
>  environment.c                       |  1 +
>  t/t5590-unpack-non-delta-objects.sh | 36 +++++++++++++-
>  6 files changed, 125 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
> index c04f62a54a..601b7a2418 100644
> --- a/Documentation/config/core.txt
> +++ b/Documentation/config/core.txt
> @@ -424,6 +424,17 @@ be delta compressed, but larger binary media files won't be.
>  +
>  Common unit suffixes of 'k', 'm', or 'g' are supported.
>
> +core.bigFileStreamingThreshold::
> +       Files larger than this will be streamed out to a temporary
> +       object file while being hashed, which will when be renamed
> +       in-place to a loose object, particularly if the
> +       `core.bigFileThreshold' setting dictates that they're always
> +       written out as loose objects.

Han Xin told me the reason to introduce another git config variable,
but I feel it not good to introduce an application specific config
variable as "core.XXX" and parsing it in "config.c".

So in patch v8, will still reuse the config variable
"core.bigFileThreshold", and will introduce an application specific
config variable, such as unpack.bigFileThreshold and parse the new
config in "builtin/unpack-objects.c".

--
Jiang Xin
diff mbox series

Patch

diff --git a/Documentation/config/core.txt b/Documentation/config/core.txt
index c04f62a54a..601b7a2418 100644
--- a/Documentation/config/core.txt
+++ b/Documentation/config/core.txt
@@ -424,6 +424,17 @@  be delta compressed, but larger binary media files won't be.
 +
 Common unit suffixes of 'k', 'm', or 'g' are supported.
 
+core.bigFileStreamingThreshold::
+	Files larger than this will be streamed out to a temporary
+	object file while being hashed, which will when be renamed
+	in-place to a loose object, particularly if the
+	`core.bigFileThreshold' setting dictates that they're always
+	written out as loose objects.
++
+Default is 128 MiB on all platforms.
++
+Common unit suffixes of 'k', 'm', or 'g' are supported.
+
 core.excludesFile::
 	Specifies the pathname to the file that contains patterns to
 	describe paths that are not meant to be tracked, in addition
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c
index 9104eb48da..72d8616e00 100644
--- a/builtin/unpack-objects.c
+++ b/builtin/unpack-objects.c
@@ -331,11 +331,82 @@  static void added_object(unsigned nr, enum object_type type,
 	}
 }
 
+struct input_zstream_data {
+	git_zstream *zstream;
+	unsigned char buf[8192];
+	int status;
+};
+
+static const void *feed_input_zstream(struct input_stream *in_stream,
+				      unsigned long *readlen)
+{
+	struct input_zstream_data *data = in_stream->data;
+	git_zstream *zstream = data->zstream;
+	void *in = fill(1);
+
+	if (!len || data->status == Z_STREAM_END) {
+		*readlen = 0;
+		return NULL;
+	}
+
+	zstream->next_out = data->buf;
+	zstream->avail_out = sizeof(data->buf);
+	zstream->next_in = in;
+	zstream->avail_in = len;
+
+	data->status = git_inflate(zstream, 0);
+	use(len - zstream->avail_in);
+	*readlen = sizeof(data->buf) - zstream->avail_out;
+
+	return data->buf;
+}
+
+static void write_stream_blob(unsigned nr, size_t size)
+{
+	git_zstream zstream;
+	struct input_zstream_data data;
+	struct input_stream in_stream = {
+		.read = feed_input_zstream,
+		.data = &data,
+	};
+
+	memset(&zstream, 0, sizeof(zstream));
+	memset(&data, 0, sizeof(data));
+	data.zstream = &zstream;
+	git_inflate_init(&zstream);
+
+	if (write_stream_object_file(&in_stream, size, OBJ_BLOB, 0, 0,
+				     &obj_list[nr].oid))
+		die(_("failed to write object in stream"));
+
+	if (zstream.total_out != size || data.status != Z_STREAM_END)
+		die(_("inflate returned %d"), data.status);
+	git_inflate_end(&zstream);
+
+	if (strict) {
+		struct blob *blob =
+			lookup_blob(the_repository, &obj_list[nr].oid);
+		if (blob)
+			blob->object.flags |= FLAG_WRITTEN;
+		else
+			die(_("invalid blob object from stream"));
+	}
+	obj_list[nr].obj = NULL;
+}
+
 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 				   unsigned nr)
 {
-	void *buf = get_data(size, dry_run);
+	void *buf;
+
+	/* Write large blob in stream without allocating full buffer. */
+	if (!dry_run && type == OBJ_BLOB &&
+	    size > big_file_streaming_threshold) {
+		write_stream_blob(nr, size);
+		return;
+	}
 
+	buf = get_data(size, dry_run);
 	if (!dry_run && buf)
 		write_object(nr, type, buf, size);
 	else
diff --git a/cache.h b/cache.h
index 64071a8d80..8c9123cb5d 100644
--- a/cache.h
+++ b/cache.h
@@ -974,6 +974,7 @@  extern size_t packed_git_window_size;
 extern size_t packed_git_limit;
 extern size_t delta_base_cache_limit;
 extern unsigned long big_file_threshold;
+extern unsigned long big_file_streaming_threshold;
 extern unsigned long pack_size_limit_cfg;
 
 /*
diff --git a/config.c b/config.c
index c5873f3a70..7b122a142a 100644
--- a/config.c
+++ b/config.c
@@ -1408,6 +1408,11 @@  static int git_default_core_config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.bigfilestreamingthreshold")) {
+		big_file_streaming_threshold = git_config_ulong(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.packedgitlimit")) {
 		packed_git_limit = git_config_ulong(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 0d06a31024..04bba593de 100644
--- a/environment.c
+++ b/environment.c
@@ -47,6 +47,7 @@  size_t packed_git_window_size = DEFAULT_PACKED_GIT_WINDOW_SIZE;
 size_t packed_git_limit = DEFAULT_PACKED_GIT_LIMIT;
 size_t delta_base_cache_limit = 96 * 1024 * 1024;
 unsigned long big_file_threshold = 512 * 1024 * 1024;
+unsigned long big_file_streaming_threshold = 128 * 1024 * 1024;
 int pager_use_color = 1;
 const char *editor_program;
 const char *askpass_program;
diff --git a/t/t5590-unpack-non-delta-objects.sh b/t/t5590-unpack-non-delta-objects.sh
index 48c4fb1ba3..8436cbf8db 100755
--- a/t/t5590-unpack-non-delta-objects.sh
+++ b/t/t5590-unpack-non-delta-objects.sh
@@ -13,6 +13,11 @@  export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
 prepare_dest () {
 	test_when_finished "rm -rf dest.git" &&
 	git init --bare dest.git
+	if test -n "$1"
+	then
+		git -C dest.git config core.bigFileStreamingThreshold $1
+		git -C dest.git config core.bigFileThreshold $1
+	fi
 }
 
 test_expect_success "setup repo with big blobs (1.5 MB)" '
@@ -33,7 +38,7 @@  test_expect_success 'setup env: GIT_ALLOC_LIMIT to 1MB' '
 '
 
 test_expect_success 'fail to unpack-objects: cannot allocate' '
-	prepare_dest &&
+	prepare_dest 2m &&
 	test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err &&
 	grep "fatal: attempting to allocate" err &&
 	(
@@ -44,6 +49,35 @@  test_expect_success 'fail to unpack-objects: cannot allocate' '
 	! test_cmp expect actual
 '
 
+test_expect_success 'unpack big object in stream' '
+	prepare_dest 1m &&
+	mkdir -p dest.git/objects/05 &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'unpack big object in stream with existing oids' '
+	prepare_dest 1m &&
+	git -C dest.git index-pack --stdin <test-$PACK.pack &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual &&
+	git -C dest.git unpack-objects <test-$PACK.pack &&
+	git -C dest.git fsck &&
+	(
+		cd dest.git &&
+		find objects/?? -type f | sort
+	) >actual &&
+	test_must_be_empty actual
+'
+
 test_expect_success 'unpack-objects dry-run' '
 	prepare_dest &&
 	git -C dest.git unpack-objects -n <test-$PACK.pack &&