Message ID | 20211203093530.93589-2-chiyutianyi@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | unpack large objects in stream | expand |
On Fri, Dec 03 2021, Han Xin wrote: > From: Han Xin <hanxin.hx@alibaba-inc.com> > > We used to call "get_data()" in "unpack_non_delta_entry()" to read the > entire contents of a blob object, no matter how big it is. This > implementation may consume all the memory and cause OOM. > > This can be improved by feeding data to "write_loose_object()" in a > stream. The input stream is implemented as an interface. In the first > step, we make a simple implementation, feeding the entire buffer in the > "stream" to "write_loose_object()" as a refactor. > > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com> > --- > object-file.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++---- > object-store.h | 6 ++++++ > 2 files changed, 55 insertions(+), 4 deletions(-) > > diff --git a/object-file.c b/object-file.c > index eb972cdccd..82656f7428 100644 > --- a/object-file.c > +++ b/object-file.c > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename) > return fd; > } > > +struct simple_input_stream_data { > + const void *buf; > + unsigned long len; > +}; I see why you picked "const void *buf" here, over say const char *, it's what "struct input_stream" uses. But why not use size_t for the length, as input_stream does? > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len) > +{ > + struct simple_input_stream_data *data = in_stream->data; > + > + if (data->len == 0) { nit: if (!data->len)... > + *len = 0; > + return NULL; > + } > + *len = data->len; > + data->len = 0; > + return data->buf; But isn't the body of this function the same as: *len = data->len; if (!*len) return NULL; data->len = 0; return data->buf; I.e. you don't need the condition for setting "*len" if it's 0, then data->len is also 0. You just want to return NULL afterwards, and not set (harmless, but no need) data->len to 0, or return data->buf. 
> + struct input_stream in_stream = { > + .read = feed_simple_input_stream, > + .data = (void *)&(struct simple_input_stream_data) { > + .buf = buf, > + .len = len, > + }, > + .size = len, > + }; Maybe it's that I'm unused to it, but I find this a bit more readable: @@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len, { char hdr[MAX_HEADER_LEN]; int hdrlen = sizeof(hdr); + struct simple_input_stream_data tmp = { + .buf = buf, + .len = len, + }; struct input_stream in_stream = { .read = feed_simple_input_stream, - .data = (void *)&(struct simple_input_stream_data) { - .buf = buf, - .len = len, - }, + .data = (void *)&tmp, .size = len, }; Yes there's a temporary variable, but no denser inline casting. Also easier to step through in a debugger (which will have the type information on "tmp"). > int hash_object_file_literally(const void *buf, unsigned long len, > @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len, > { > char *header; > int hdrlen, status = 0; > + struct input_stream in_stream = { > + .read = feed_simple_input_stream, > + .data = (void *)&(struct simple_input_stream_data) { > + .buf = buf, > + .len = len, > + }, > + .size = len, > + }; ditto.. 
> /* type string, SP, %lu of the length plus NUL must fit this */ > hdrlen = strlen(type) + MAX_HEADER_LEN; > @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len, > goto cleanup; > if (freshen_packed_object(oid) || freshen_loose_object(oid)) > goto cleanup; > - status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0); > + status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0); > > cleanup: > free(header); > @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime) > char hdr[MAX_HEADER_LEN]; > int hdrlen; > int ret; > + struct simple_input_stream_data data; > + struct input_stream in_stream = { > + .read = feed_simple_input_stream, > + .data = &data, > + }; > > if (has_loose_object(oid)) > return 0; > buf = read_object(the_repository, oid, &type, &len); > + in_stream.size = len; Why are we setting this here?... > if (!buf) > return error(_("cannot read object for %s"), oid_to_hex(oid)); ...Instead of after this point, as we may error and never use it? > + data.buf = buf; > + data.len = len; Probably won't matter, just a nit... > +struct input_stream { > + const void *(*read)(struct input_stream *, unsigned long *len); > + void *data; > + size_t size; > +}; > + Ah, and here's the size_t... :)
On Fri, Dec 3, 2021 at 9:41 PM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote: > > > On Fri, Dec 03 2021, Han Xin wrote: > > > From: Han Xin <hanxin.hx@alibaba-inc.com> > > > > We used to call "get_data()" in "unpack_non_delta_entry()" to read the > > entire contents of a blob object, no matter how big it is. This > > implementation may consume all the memory and cause OOM. > > > > This can be improved by feeding data to "write_loose_object()" in a > > stream. The input stream is implemented as an interface. In the first > > step, we make a simple implementation, feeding the entire buffer in the > > "stream" to "write_loose_object()" as a refactor. > > > > Helped-by: Jiang Xin <zhiyou.jx@alibaba-inc.com> > > Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com> > > --- > > object-file.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++---- > > object-store.h | 6 ++++++ > > 2 files changed, 55 insertions(+), 4 deletions(-) > > > > diff --git a/object-file.c b/object-file.c > > index eb972cdccd..82656f7428 100644 > > --- a/object-file.c > > +++ b/object-file.c > > @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename) > > return fd; > > } > > > > +struct simple_input_stream_data { > > + const void *buf; > > + unsigned long len; > > +}; > > I see why you picked "const void *buf" here, over say const char *, it's > what "struct input_stream" uses. > > But why not use size_t for the length, as input_stream does? > Yes, "size_t" will be better here. > > +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len) > > +{ > > + struct simple_input_stream_data *data = in_stream->data; > > + > > + if (data->len == 0) { > > nit: if (!data->len)... > Will apply. 
> > + *len = 0; > > + return NULL; > > + } > > + *len = data->len; > > + data->len = 0; > > + return data->buf; > > But isn't the body of this function the same as: > > *len = data->len; > if (!*len) > return NULL; > data->len = 0; > return data->buf; > > I.e. you don't need the condition for setting "*len" if it's 0, then > data->len is also 0. You just want to return NULL afterwards, and not > set (harmless, but no need) data->len to 0, or return data->buf. Will apply. > > + struct input_stream in_stream = { > > + .read = feed_simple_input_stream, > > + .data = (void *)&(struct simple_input_stream_data) { > > + .buf = buf, > > + .len = len, > > + }, > > + .size = len, > > + }; > > Maybe it's that I'm unused to it, but I find this a bit more readable: > > @@ -2013,12 +2011,13 @@ int write_object_file_flags(const void *buf, unsigned long len, > { > char hdr[MAX_HEADER_LEN]; > int hdrlen = sizeof(hdr); > + struct simple_input_stream_data tmp = { > + .buf = buf, > + .len = len, > + }; > struct input_stream in_stream = { > .read = feed_simple_input_stream, > - .data = (void *)&(struct simple_input_stream_data) { > - .buf = buf, > - .len = len, > - }, > + .data = (void *)&tmp, > .size = len, > }; > > Yes there's a temporary variable, but no denser inline casting. Also > easier to step through in a debugger (which will have the type > information on "tmp"). > Will apply. > > int hash_object_file_literally(const void *buf, unsigned long len, > > @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len, > > { > > char *header; > > int hdrlen, status = 0; > > + struct input_stream in_stream = { > > + .read = feed_simple_input_stream, > > + .data = (void *)&(struct simple_input_stream_data) { > > + .buf = buf, > > + .len = len, > > + }, > > + .size = len, > > + }; > > ditto.. 
> > > /* type string, SP, %lu of the length plus NUL must fit this */ > > hdrlen = strlen(type) + MAX_HEADER_LEN; > > @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len, > > goto cleanup; > > if (freshen_packed_object(oid) || freshen_loose_object(oid)) > > goto cleanup; > > - status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0); > > + status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0); > > > > cleanup: > > free(header); > > @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime) > > char hdr[MAX_HEADER_LEN]; > > int hdrlen; > > int ret; > > + struct simple_input_stream_data data; > > + struct input_stream in_stream = { > > + .read = feed_simple_input_stream, > > + .data = &data, > > + }; > > > > if (has_loose_object(oid)) > > return 0; > > buf = read_object(the_repository, oid, &type, &len); > > + in_stream.size = len; > > Why are we setting this here?... > Yes, putting "in_stream.size=len;" here was a stupid decision. > > if (!buf) > > return error(_("cannot read object for %s"), oid_to_hex(oid)); > > ...Instead of after this point, as we may error and never use it? > > > + data.buf = buf; > > + data.len = len; > > Probably won't matter, just a nit... > > > +struct input_stream { > > + const void *(*read)(struct input_stream *, unsigned long *len); > > + void *data; > > + size_t size; > > +}; > > + > > Ah, and here's the size_t... :)
diff --git a/object-file.c b/object-file.c index eb972cdccd..82656f7428 100644 --- a/object-file.c +++ b/object-file.c @@ -1860,8 +1860,26 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename) return fd; } +struct simple_input_stream_data { + const void *buf; + unsigned long len; +}; + +static const void *feed_simple_input_stream(struct input_stream *in_stream, unsigned long *len) +{ + struct simple_input_stream_data *data = in_stream->data; + + if (data->len == 0) { + *len = 0; + return NULL; + } + *len = data->len; + data->len = 0; + return data->buf; +} + static int write_loose_object(const struct object_id *oid, char *hdr, - int hdrlen, const void *buf, unsigned long len, + int hdrlen, struct input_stream *in_stream, time_t mtime, unsigned flags) { int fd, ret; @@ -1871,6 +1889,8 @@ static int write_loose_object(const struct object_id *oid, char *hdr, struct object_id parano_oid; static struct strbuf tmp_file = STRBUF_INIT; static struct strbuf filename = STRBUF_INIT; + const void *buf; + unsigned long len; loose_object_path(the_repository, &filename, oid); @@ -1898,6 +1918,7 @@ static int write_loose_object(const struct object_id *oid, char *hdr, the_hash_algo->update_fn(&c, hdr, hdrlen); /* Then the data itself.. */ + buf = in_stream->read(in_stream, &len); stream.next_in = (void *)buf; stream.avail_in = len; do { @@ -1960,6 +1981,14 @@ int write_object_file_flags(const void *buf, unsigned long len, { char hdr[MAX_HEADER_LEN]; int hdrlen = sizeof(hdr); + struct input_stream in_stream = { + .read = feed_simple_input_stream, + .data = (void *)&(struct simple_input_stream_data) { + .buf = buf, + .len = len, + }, + .size = len, + }; /* Normally if we have it in the pack then we do not bother writing * it out into .git/objects/??/?{38} file. 
@@ -1968,7 +1997,7 @@ int write_object_file_flags(const void *buf, unsigned long len, &hdrlen); if (freshen_packed_object(oid) || freshen_loose_object(oid)) return 0; - return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags); + return write_loose_object(oid, hdr, hdrlen, &in_stream, 0, flags); } int hash_object_file_literally(const void *buf, unsigned long len, @@ -1977,6 +2006,14 @@ int hash_object_file_literally(const void *buf, unsigned long len, { char *header; int hdrlen, status = 0; + struct input_stream in_stream = { + .read = feed_simple_input_stream, + .data = (void *)&(struct simple_input_stream_data) { + .buf = buf, + .len = len, + }, + .size = len, + }; /* type string, SP, %lu of the length plus NUL must fit this */ hdrlen = strlen(type) + MAX_HEADER_LEN; @@ -1988,7 +2025,7 @@ int hash_object_file_literally(const void *buf, unsigned long len, goto cleanup; if (freshen_packed_object(oid) || freshen_loose_object(oid)) goto cleanup; - status = write_loose_object(oid, header, hdrlen, buf, len, 0, 0); + status = write_loose_object(oid, header, hdrlen, &in_stream, 0, 0); cleanup: free(header); @@ -2003,14 +2040,22 @@ int force_object_loose(const struct object_id *oid, time_t mtime) char hdr[MAX_HEADER_LEN]; int hdrlen; int ret; + struct simple_input_stream_data data; + struct input_stream in_stream = { + .read = feed_simple_input_stream, + .data = &data, + }; if (has_loose_object(oid)) return 0; buf = read_object(the_repository, oid, &type, &len); + in_stream.size = len; if (!buf) return error(_("cannot read object for %s"), oid_to_hex(oid)); + data.buf = buf; + data.len = len; hdrlen = xsnprintf(hdr, sizeof(hdr), "%s %"PRIuMAX , type_name(type), (uintmax_t)len) + 1; - ret = write_loose_object(oid, hdr, hdrlen, buf, len, mtime, 0); + ret = write_loose_object(oid, hdr, hdrlen, &in_stream, mtime, 0); free(buf); return ret; diff --git a/object-store.h b/object-store.h index 952efb6a4b..a84d891d60 100644 --- a/object-store.h +++ b/object-store.h @@ -34,6 
+34,12 @@ struct object_directory { char *path; }; +struct input_stream { + const void *(*read)(struct input_stream *, unsigned long *len); + void *data; + size_t size; +}; + KHASH_INIT(odb_path_map, const char * /* key: odb_path */, struct object_directory *, 1, fspathhash, fspatheq)