Message ID | 20220108085419.79682-2-chiyutianyi@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | unpack large blobs in stream | expand |
Am 08.01.22 um 09:54 schrieb Han Xin: > From: Han Xin <hanxin.hx@alibaba-inc.com> > > As the name implies, "get_data(size)" will allocate and return a given > size of memory. Allocating memory for a large blob object may cause the > system to run out of memory. Before preparing to replace calling of > "get_data()" to unpack large blob objects in latter commits, refactor > "get_data()" to reduce memory footprint for dry_run mode. > > Because in dry_run mode, "get_data()" is only used to check the > integrity of data, and the returned buffer is not used at all, we can > allocate a smaller buffer and reuse it as zstream output. Therefore, > in dry_run mode, "get_data()" will release the allocated buffer and > return NULL instead of returning garbage data. > > Suggested-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com> > --- > builtin/unpack-objects.c | 39 ++++++++++++++++++------- > t/t5329-unpack-large-objects.sh | 52 +++++++++++++++++++++++++++++++++ > 2 files changed, 80 insertions(+), 11 deletions(-) > create mode 100755 t/t5329-unpack-large-objects.sh > > diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c > index 4a9466295b..c6d6c17072 100644 > --- a/builtin/unpack-objects.c > +++ b/builtin/unpack-objects.c > @@ -96,15 +96,31 @@ static void use(int bytes) > display_throughput(progress, consumed_bytes); > } > > +/* > + * Decompress zstream from stdin and return specific size of data. > + * The caller is responsible to free the returned buffer. > + * > + * But for dry_run mode, "get_data()" is only used to check the > + * integrity of data, and the returned buffer is not used at all. > + * Therefore, in dry_run mode, "get_data()" will release the small > + * allocated buffer which is reused to hold temporary zstream output > + * and return NULL instead of returning garbage data. > + */ > static void *get_data(unsigned long size) > { > git_zstream stream; > - void *buf = xmallocz(size); > + unsigned long bufsize; > + void *buf; > > memset(&stream, 0, sizeof(stream)); > + if (dry_run && size > 8192) > + bufsize = 8192; > + else > + bufsize = size; > + buf = xmallocz(bufsize); > > stream.next_out = buf; > - stream.avail_out = size; > + stream.avail_out = bufsize; > stream.next_in = fill(1); > stream.avail_in = len; > git_inflate_init(&stream); > @@ -124,8 +140,15 @@ static void *get_data(unsigned long size) > } > stream.next_in = fill(1); > stream.avail_in = len; > + if (dry_run) { > + /* reuse the buffer in dry_run mode */ > + stream.next_out = buf; > + stream.avail_out = bufsize; > + } > } > git_inflate_end(&stream); > + if (dry_run) > + FREE_AND_NULL(buf); > return buf; > } > > @@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size, > { > void *buf = get_data(size); > > - if (!dry_run && buf) > + if (buf) > write_object(nr, type, buf, size); > - else > - free(buf); > } > > static int resolve_against_held(unsigned nr, const struct object_id *base, > @@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size, > oidread(&base_oid, fill(the_hash_algo->rawsz)); > use(the_hash_algo->rawsz); > delta_data = get_data(delta_size); > - if (dry_run || !delta_data) { > - free(delta_data); > + if (!delta_data) > return; > - } > if (has_object_file(&base_oid)) > ; /* Ok we have this one */ > else if (resolve_against_held(nr, &base_oid, > @@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size, > die("offset value out of bound for delta base object"); > > delta_data = get_data(delta_size); > - if (dry_run || !delta_data) { > - free(delta_data); > + if (!delta_data) > return; > - } > lo = 0; > hi = nr; > while (lo < hi) { Nice! > diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh > new file mode 100755 > index 0000000000..39c7a62d94 > --- /dev/null > +++ b/t/t5329-unpack-large-objects.sh > @@ -0,0 +1,52 @@ > +#!/bin/sh > +# > +# Copyright (c) 2021 Han Xin > +# > + > +test_description='git unpack-objects with large objects' > + > +. ./test-lib.sh > + > +prepare_dest () { > + test_when_finished "rm -rf dest.git" && > + git init --bare dest.git > +} > + > +assert_no_loose () { > + glob=dest.git/objects/?? && > + echo "$glob" >expect && > + eval "echo $glob" >actual && > + test_cmp expect actual > +} > + > +assert_no_pack () { > + rmdir dest.git/objects/pack I would expect a function whose name starts with "assert" to have no side effects. It doesn't matter here, because it's called only at the very end, but that might change. You can use test_dir_is_empty instead of rmdir. > +} > + > +test_expect_success "create large objects (1.5 MB) and PACK" ' > + test-tool genrandom foo 1500000 >big-blob && > + test_commit --append foo big-blob && > + test-tool genrandom bar 1500000 >big-blob && > + test_commit --append bar big-blob && > + PACK=$(echo HEAD | git pack-objects --revs test) > +' > + > +test_expect_success 'set memory limitation to 1MB' ' > + GIT_ALLOC_LIMIT=1m && > + export GIT_ALLOC_LIMIT > +' > + > +test_expect_success 'unpack-objects failed under memory limitation' ' > + prepare_dest && > + test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err && > + grep "fatal: attempting to allocate" err > +' > + > +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' ' > + prepare_dest && > + git -C dest.git unpack-objects -n <test-$PACK.pack && > + assert_no_loose && > + assert_no_pack > +' > + > +test_done
On Sat, Jan 8, 2022 at 8:28 PM René Scharfe <l.s.r@web.de> wrote: > > Am 08.01.22 um 09:54 schrieb Han Xin: > > From: Han Xin <hanxin.hx@alibaba-inc.com> > > > > +assert_no_loose () { > > + glob=dest.git/objects/?? && > > + echo "$glob" >expect && > > + eval "echo $glob" >actual && > > + test_cmp expect actual > > +} > > + > > +assert_no_pack () { > > + rmdir dest.git/objects/pack > > I would expect a function whose name starts with "assert" to have no > side effects. It doesn't matter here, because it's called only at the > very end, but that might change. You can use test_dir_is_empty instead > of rmdir. > *nod* I think it would be better to rename "assert_no_loose()" to "test_no_loose()". I will remove "assert_no_pack()" and use "test_dir_is_empty()" instead. Thanks -Han Xin
diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index 4a9466295b..c6d6c17072 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -96,15 +96,31 @@ static void use(int bytes) display_throughput(progress, consumed_bytes); } +/* + * Decompress zstream from stdin and return specific size of data. + * The caller is responsible to free the returned buffer. + * + * But for dry_run mode, "get_data()" is only used to check the + * integrity of data, and the returned buffer is not used at all. + * Therefore, in dry_run mode, "get_data()" will release the small + * allocated buffer which is reused to hold temporary zstream output + * and return NULL instead of returning garbage data. + */ static void *get_data(unsigned long size) { git_zstream stream; - void *buf = xmallocz(size); + unsigned long bufsize; + void *buf; memset(&stream, 0, sizeof(stream)); + if (dry_run && size > 8192) + bufsize = 8192; + else + bufsize = size; + buf = xmallocz(bufsize); stream.next_out = buf; - stream.avail_out = size; + stream.avail_out = bufsize; stream.next_in = fill(1); stream.avail_in = len; git_inflate_init(&stream); @@ -124,8 +140,15 @@ static void *get_data(unsigned long size) } stream.next_in = fill(1); stream.avail_in = len; + if (dry_run) { + /* reuse the buffer in dry_run mode */ + stream.next_out = buf; + stream.avail_out = bufsize; + } } git_inflate_end(&stream); + if (dry_run) + FREE_AND_NULL(buf); return buf; } @@ -325,10 +348,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size, { void *buf = get_data(size); - if (!dry_run && buf) + if (buf) write_object(nr, type, buf, size); - else - free(buf); } static int resolve_against_held(unsigned nr, const struct object_id *base, @@ -358,10 +379,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size, oidread(&base_oid, fill(the_hash_algo->rawsz)); use(the_hash_algo->rawsz); delta_data = get_data(delta_size); - if (dry_run || !delta_data) { - free(delta_data); + if (!delta_data) return; - } if (has_object_file(&base_oid)) ; /* Ok we have this one */ else if (resolve_against_held(nr, &base_oid, @@ -397,10 +416,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size, die("offset value out of bound for delta base object"); delta_data = get_data(delta_size); - if (dry_run || !delta_data) { - free(delta_data); + if (!delta_data) return; - } lo = 0; hi = nr; while (lo < hi) { diff --git a/t/t5329-unpack-large-objects.sh b/t/t5329-unpack-large-objects.sh new file mode 100755 index 0000000000..39c7a62d94 --- /dev/null +++ b/t/t5329-unpack-large-objects.sh @@ -0,0 +1,52 @@ +#!/bin/sh +# +# Copyright (c) 2021 Han Xin +# + +test_description='git unpack-objects with large objects' + +. ./test-lib.sh + +prepare_dest () { + test_when_finished "rm -rf dest.git" && + git init --bare dest.git +} + +assert_no_loose () { + glob=dest.git/objects/?? && + echo "$glob" >expect && + eval "echo $glob" >actual && + test_cmp expect actual +} + +assert_no_pack () { + rmdir dest.git/objects/pack +} + +test_expect_success "create large objects (1.5 MB) and PACK" ' + test-tool genrandom foo 1500000 >big-blob && + test_commit --append foo big-blob && + test-tool genrandom bar 1500000 >big-blob && + test_commit --append bar big-blob && + PACK=$(echo HEAD | git pack-objects --revs test) +' + +test_expect_success 'set memory limitation to 1MB' ' + GIT_ALLOC_LIMIT=1m && + export GIT_ALLOC_LIMIT +' + +test_expect_success 'unpack-objects failed under memory limitation' ' + prepare_dest && + test_must_fail git -C dest.git unpack-objects <test-$PACK.pack 2>err && + grep "fatal: attempting to allocate" err +' + +test_expect_success 'unpack-objects works with memory limitation in dry-run mode' ' + prepare_dest && + git -C dest.git unpack-objects -n <test-$PACK.pack && + assert_no_loose && + assert_no_pack +' + +test_done