diff mbox series

[v7,3/5] object-file.c: refactor write_loose_object() to reuse in stream version

Message ID 20211221115201.12120-4-chiyutianyi@gmail.com (mailing list archive)
State Superseded
Headers show
Series unpack large blobs in stream | expand

Commit Message

Han Xin Dec. 21, 2021, 11:51 a.m. UTC
From: Han Xin <hanxin.hx@alibaba-inc.com>

We used to call "get_data()" in "unpack_non_delta_entry()" to read the
entire contents of a blob object, no matter how big it is. This
implementation may consume all the memory and cause OOM.

This can be improved by feeding data to "stream_loose_object()" in
stream instead of read into the whole buf.

As this new method "stream_loose_object()" has many similarities with
"write_loose_object()", we split up "write_loose_object()" into some
steps:
 1. Figuring out a path for the (temp) object file.
 2. Creating the tempfile.
 3. Setting up zlib and write header.
 4. Write object data and handle errors.
 5. Optionally, do someting after write, maybe force a loose object if
"mtime".

Helped-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Han Xin <hanxin.hx@alibaba-inc.com>
---
 object-file.c | 100 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 35 deletions(-)

Comments

Ævar Arnfjörð Bjarmason Dec. 21, 2021, 2:16 p.m. UTC | #1
On Tue, Dec 21 2021, Han Xin wrote:

> From: Han Xin <hanxin.hx@alibaba-inc.com>
> [...]
> @@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
>  		strbuf_reset(tmp);
>  		strbuf_add(tmp, filename, dirlen - 1);
>  		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> -			return -1;
> +			break;
>  		if (adjust_shared_perm(tmp->buf))
> -			return -1;
> +			break;
>  
>  		/* Try again */
>  		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
>  		fd = git_mkstemp_mode(tmp->buf, 0444);
> +	} while (0);
> +
> +	if (fd < 0 && !(flags & HASH_SILENT)) {
> +		if (errno == EACCES)
> +			return error(_("insufficient permission for adding an "
> +				       "object to repository database %s"),
> +				     get_object_directory());

This should be an error_errno() instead, ...

> +		else
> +			return error_errno(_("unable to create temporary file"));

...and we can just fold this whole if/else into one condition with a
briefer message, e.g.:

    error_errno(_("unable to add object to '%s'"), get_object_directory());

Or whatever, unless there's another bug here where you inverted these
conditions, and the "else" really should not use "error_errno" but
"error".... (I don't know...)
Jiang Xin Dec. 22, 2021, 12:02 p.m. UTC | #2
On Wed, Dec 22, 2021 at 8:40 AM Ævar Arnfjörð Bjarmason
<avarab@gmail.com> wrote:
>
>
> On Tue, Dec 21 2021, Han Xin wrote:
>
> > From: Han Xin <hanxin.hx@alibaba-inc.com>
> > [...]
> > @@ -1854,17 +1876,48 @@ static int create_tmpfile(struct strbuf *tmp, const char *filename)
> >               strbuf_reset(tmp);
> >               strbuf_add(tmp, filename, dirlen - 1);
> >               if (mkdir(tmp->buf, 0777) && errno != EEXIST)
> > -                     return -1;
> > +                     break;
> >               if (adjust_shared_perm(tmp->buf))
> > -                     return -1;
> > +                     break;
> >
> >               /* Try again */
> >               strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
> >               fd = git_mkstemp_mode(tmp->buf, 0444);
> > +     } while (0);
> > +
> > +     if (fd < 0 && !(flags & HASH_SILENT)) {
> > +             if (errno == EACCES)
> > +                     return error(_("insufficient permission for adding an "
> > +                                    "object to repository database %s"),
> > +                                  get_object_directory());
>
> This should be an error_errno() instead, ...

We already know the errno (EACCESS) and output a decent error message,
so using error() is OK.  BTW, it's just a refactor by copy & paste.

>
> > +             else
> > +                     return error_errno(_("unable to create temporary file"));
>
> ...and we can just fold this whole if/else into one condition with a
> briefer message, e.g.:
>
>     error_errno(_("unable to add object to '%s'"), get_object_directory());
>
> Or whatever, unless there's another bug here where you inverted these
> conditions, and the "else" really should not use "error_errno" but
> "error".... (I don't know...)
diff mbox series

Patch

diff --git a/object-file.c b/object-file.c
index 6bba4766f9..e048f3d39e 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1751,6 +1751,25 @@  static void write_object_file_prepare(const struct git_hash_algo *algo,
 	algo->final_oid_fn(oid, &c);
 }
 
+/*
+ * Move the just written object with proper mtime into its final resting place.
+ */
+static int finalize_object_file_with_mtime(const char *tmpfile,
+					   const char *filename,
+					   time_t mtime,
+					   unsigned flags)
+{
+	struct utimbuf utb;
+
+	if (mtime) {
+		utb.actime = mtime;
+		utb.modtime = mtime;
+		if (utime(tmpfile, &utb) < 0 && !(flags & HASH_SILENT))
+			warning_errno(_("failed utime() on %s"), tmpfile);
+	}
+	return finalize_object_file(tmpfile, filename);
+}
+
 /*
  * Move the just written object into its final resting place.
  */
@@ -1836,7 +1855,8 @@  static inline int directory_size(const char *filename)
  * We want to avoid cross-directory filename renames, because those
  * can have problems on various filesystems (FAT, NFS, Coda).
  */
-static int create_tmpfile(struct strbuf *tmp, const char *filename)
+static int create_tmpfile(struct strbuf *tmp, const char *filename,
+			  unsigned flags)
 {
 	int fd, dirlen = directory_size(filename);
 
@@ -1844,7 +1864,9 @@  static int create_tmpfile(struct strbuf *tmp, const char *filename)
 	strbuf_add(tmp, filename, dirlen);
 	strbuf_addstr(tmp, "tmp_obj_XXXXXX");
 	fd = git_mkstemp_mode(tmp->buf, 0444);
-	if (fd < 0 && dirlen && errno == ENOENT) {
+	do {
+		if (fd >= 0 || !dirlen || errno != ENOENT)
+			break;
 		/*
 		 * Make sure the directory exists; note that the contents
 		 * of the buffer are undefined after mkstemp returns an
@@ -1854,17 +1876,48 @@  static int create_tmpfile(struct strbuf *tmp, const char *filename)
 		strbuf_reset(tmp);
 		strbuf_add(tmp, filename, dirlen - 1);
 		if (mkdir(tmp->buf, 0777) && errno != EEXIST)
-			return -1;
+			break;
 		if (adjust_shared_perm(tmp->buf))
-			return -1;
+			break;
 
 		/* Try again */
 		strbuf_addstr(tmp, "/tmp_obj_XXXXXX");
 		fd = git_mkstemp_mode(tmp->buf, 0444);
+	} while (0);
+
+	if (fd < 0 && !(flags & HASH_SILENT)) {
+		if (errno == EACCES)
+			return error(_("insufficient permission for adding an "
+				       "object to repository database %s"),
+				     get_object_directory());
+		else
+			return error_errno(_("unable to create temporary file"));
 	}
+
 	return fd;
 }
 
+static void setup_stream_and_header(git_zstream *stream,
+				    unsigned char *compressed,
+				    unsigned long compressed_size,
+				    git_hash_ctx *c,
+				    char *hdr,
+				    int hdrlen)
+{
+	/* Set it up */
+	git_deflate_init(stream, zlib_compression_level);
+	stream->next_out = compressed;
+	stream->avail_out = compressed_size;
+	the_hash_algo->init_fn(c);
+
+	/* First header.. */
+	stream->next_in = (unsigned char *)hdr;
+	stream->avail_in = hdrlen;
+	while (git_deflate(stream, 0) == Z_OK)
+		; /* nothing */
+	the_hash_algo->update_fn(c, hdr, hdrlen);
+}
+
 static int write_loose_object(const struct object_id *oid, char *hdr,
 			      int hdrlen, const void *buf, unsigned long len,
 			      time_t mtime, unsigned flags)
@@ -1879,28 +1932,13 @@  static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	loose_object_path(the_repository, &filename, oid);
 
-	fd = create_tmpfile(&tmp_file, filename.buf);
-	if (fd < 0) {
-		if (flags & HASH_SILENT)
-			return -1;
-		else if (errno == EACCES)
-			return error(_("insufficient permission for adding an object to repository database %s"), get_object_directory());
-		else
-			return error_errno(_("unable to create temporary file"));
-	}
-
-	/* Set it up */
-	git_deflate_init(&stream, zlib_compression_level);
-	stream.next_out = compressed;
-	stream.avail_out = sizeof(compressed);
-	the_hash_algo->init_fn(&c);
+	fd = create_tmpfile(&tmp_file, filename.buf, flags);
+	if (fd < 0)
+		return -1;
 
-	/* First header.. */
-	stream.next_in = (unsigned char *)hdr;
-	stream.avail_in = hdrlen;
-	while (git_deflate(&stream, 0) == Z_OK)
-		; /* nothing */
-	the_hash_algo->update_fn(&c, hdr, hdrlen);
+	/* Set it up and write header */
+	setup_stream_and_header(&stream, compressed, sizeof(compressed),
+				&c, hdr, hdrlen);
 
 	/* Then the data itself.. */
 	stream.next_in = (void *)buf;
@@ -1929,16 +1967,8 @@  static int write_loose_object(const struct object_id *oid, char *hdr,
 
 	close_loose_object(fd);
 
-	if (mtime) {
-		struct utimbuf utb;
-		utb.actime = mtime;
-		utb.modtime = mtime;
-		if (utime(tmp_file.buf, &utb) < 0 &&
-		    !(flags & HASH_SILENT))
-			warning_errno(_("failed utime() on %s"), tmp_file.buf);
-	}
-
-	return finalize_object_file(tmp_file.buf, filename.buf);
+	return finalize_object_file_with_mtime(tmp_file.buf, filename.buf,
+					       mtime, flags);
 }
 
 static int freshen_loose_object(const struct object_id *oid)