[v3,05/10] bulk-checkin: extract abstract `bulk_checkin_source`

Message ID da52ec838025a59a3f4f4ffaf2e6f9098a37547e.1697648864.git.me@ttaylorr.com
State Superseded
Series merge-ort: implement support for packing objects together

Commit Message

Taylor Blau Oct. 18, 2023, 5:08 p.m. UTC
A future commit will want to implement a routine very similar to
`stream_blob_to_pack()`, with two notable changes:

  - Instead of streaming just OBJ_BLOBs, this new function may want to
    stream objects of arbitrary type.

  - Instead of streaming the object's contents from an open
    file-descriptor, this new function may want to "stream" its contents
    from memory.

To avoid duplicating a significant chunk of code between the existing
`stream_blob_to_pack()` and this new routine, extract an abstract
`bulk_checkin_source`. This concept is currently a thin layer over
`lseek()` and `read_in_full()`, but will grow to understand how to
perform analogous operations when writing out an object's contents from
memory.

Suggested-by: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
---
 bulk-checkin.c | 61 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 8 deletions(-)
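
As a sketch of the direction described above (streaming an object's
contents from memory rather than from a file descriptor), the
abstraction might later grow an in-core variant along the lines below.
The `SOURCE_INCORE` name and its `buf`/`read` fields are illustrative
assumptions, not code from this patch or the series; the sketch assumes
git's usual helpers (`read_in_full()`, `BUG()`) just as the patch below
does.

	struct bulk_checkin_source {
		enum { SOURCE_FILE, SOURCE_INCORE } type;

		/* SOURCE_FILE fields */
		int fd;

		/* hypothetical SOURCE_INCORE fields */
		const void *buf;
		size_t read;	/* bytes already consumed from 'buf' */

		/* common fields */
		size_t size;
		const char *path;
	};

	static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
						 off_t offset)
	{
		switch (source->type) {
		case SOURCE_FILE:
			return lseek(source->fd, offset, SEEK_SET);
		case SOURCE_INCORE:
			/* "seeking" in memory just moves the read cursor */
			if (offset < 0 || (size_t)offset > source->size)
				return (off_t)-1;
			source->read = offset;
			return offset;
		default:
			BUG("unknown bulk-checkin source: %d", source->type);
		}
	}

	static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
						void *buf, size_t nr)
	{
		switch (source->type) {
		case SOURCE_FILE:
			return read_in_full(source->fd, buf, nr);
		case SOURCE_INCORE:
			/* copy out of the in-core buffer and advance the cursor */
			if (nr > source->size - source->read)
				nr = source->size - source->read;
			memcpy(buf, (const unsigned char *)source->buf + source->read, nr);
			source->read += nr;
			return nr;
		default:
			BUG("unknown bulk-checkin source: %d", source->type);
		}
	}

With a shape like this, `stream_blob_to_pack()` would not need further
changes for the new source, since after this patch it only touches the
source through these two helpers plus the common `size` and `path`
fields.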

Comments

Junio C Hamano Oct. 18, 2023, 11:10 p.m. UTC | #1
Taylor Blau <me@ttaylorr.com> writes:

> A future commit will want to implement a routine very similar to
> `stream_blob_to_pack()`, with two notable changes:
>
>   - Instead of streaming just OBJ_BLOBs, this new function may want to
>     stream objects of arbitrary type.
>
>   - Instead of streaming the object's contents from an open
>     file-descriptor, this new function may want to "stream" its contents
>     from memory.
>
> To avoid duplicating a significant chunk of code between the existing
> `stream_blob_to_pack()` and this new routine, extract an abstract
> `bulk_checkin_source`. This concept is currently a thin layer over
> `lseek()` and `read_in_full()`, but will grow to understand how to
> perform analogous operations when writing out an object's contents
> from memory.
>
> Suggested-by: Junio C Hamano <gitster@pobox.com>
> Signed-off-by: Taylor Blau <me@ttaylorr.com>
> ---
>  bulk-checkin.c | 61 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 53 insertions(+), 8 deletions(-)

> diff --git a/bulk-checkin.c b/bulk-checkin.c
> index f4914fb6d1..fc1d902018 100644
> --- a/bulk-checkin.c
> +++ b/bulk-checkin.c
> @@ -140,8 +140,41 @@ static int already_written(struct bulk_checkin_packfile *state, struct object_id
>  	return 0;
>  }
>  
> +struct bulk_checkin_source {
> +	enum { SOURCE_FILE } type;
> +
> +	/* SOURCE_FILE fields */
> +	int fd;
> +
> +	/* common fields */
> +	size_t size;
> +	const char *path;
> +};

Looks OK, even though I expected to see a bit more involved object
orientation with something like

	struct bulk_checkin_source {
		off_t (*read)(struct bulk_checkin_source *, void *, size_t);
		off_t (*seek)(struct bulk_checkin_source *, off_t);
		union {
			struct {
				int fd;
				size_t size;
				const char *path;
			} from_fd;
			struct {
				...
			} incore;
		} data;
	};

As there will only be two subclasses of this thing, it may not
matter all that much right now, but it would be much nicer, as your
methods would not have to care about "switch (enum) { case FILE: ... }".
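
For illustration, the `from_fd` half of that callback-based direction
might look something like the sketch below; the function names here
are hypothetical, not taken from the thread or the series.

	/* hypothetical helpers for the callback-based design sketched above */
	static off_t bulk_checkin_source_read_from_fd(struct bulk_checkin_source *source,
						      void *buf, size_t nr)
	{
		return read_in_full(source->data.from_fd.fd, buf, nr);
	}

	static off_t bulk_checkin_source_seek_from_fd(struct bulk_checkin_source *source,
						      off_t offset)
	{
		return lseek(source->data.from_fd.fd, offset, SEEK_SET);
	}

	static void init_bulk_checkin_source_from_fd(struct bulk_checkin_source *source,
						     int fd, size_t size,
						     const char *path)
	{
		source->read = bulk_checkin_source_read_from_fd;
		source->seek = bulk_checkin_source_seek_from_fd;
		source->data.from_fd.fd = fd;
		source->data.from_fd.size = size;
		source->data.from_fd.path = path;
	}

Callers would then say `source->read(source, ibuf, rsize)` and
`source->seek(source, offset)`, with no switch on a type enum in sight.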
Taylor Blau Oct. 19, 2023, 3:19 p.m. UTC | #2
On Wed, Oct 18, 2023 at 04:10:43PM -0700, Junio C Hamano wrote:
> Looks OK, even though I expected to see a bit more involved object
> orientation with something like
>
> 	struct bulk_checkin_source {
> 		off_t (*read)(struct bulk_checkin_source *, void *, size_t);
> 		off_t (*seek)(struct bulk_checkin_source *, off_t);
> 		union {
> 			struct {
> 				int fd;
> 				size_t size;
> 				const char *path;
> 			} from_fd;
> 			struct {
> 				...
> 			} incore;
> 		} data;
> 	};
>
> As there will only be two subclasses of this thing, it may not
> matter all that much right now, but it would be much nicer, as your
> methods would not have to care about "switch (enum) { case FILE: ... }".

I want to be cautious of going too far in this direction. I anticipate
that "two" is probably the maximum number of kinds of sources we can
reasonably expect for the foreseeable future. If that changes, it's easy
enough to convert from the existing implementation to something closer
to the above.

Thanks,
Taylor
Junio C Hamano Oct. 19, 2023, 5:55 p.m. UTC | #3
Taylor Blau <me@ttaylorr.com> writes:

> I want to be cautious of going too far in this direction.

That's fine.  Thanks.

Patch

diff --git a/bulk-checkin.c b/bulk-checkin.c
index f4914fb6d1..fc1d902018 100644
--- a/bulk-checkin.c
+++ b/bulk-checkin.c
@@ -140,8 +140,41 @@  static int already_written(struct bulk_checkin_packfile *state, struct object_id
 	return 0;
 }
 
+struct bulk_checkin_source {
+	enum { SOURCE_FILE } type;
+
+	/* SOURCE_FILE fields */
+	int fd;
+
+	/* common fields */
+	size_t size;
+	const char *path;
+};
+
+static off_t bulk_checkin_source_seek_to(struct bulk_checkin_source *source,
+					 off_t offset)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return lseek(source->fd, offset, SEEK_SET);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
+static ssize_t bulk_checkin_source_read(struct bulk_checkin_source *source,
+					void *buf, size_t nr)
+{
+	switch (source->type) {
+	case SOURCE_FILE:
+		return read_in_full(source->fd, buf, nr);
+	default:
+		BUG("unknown bulk-checkin source: %d", source->type);
+	}
+}
+
 /*
- * Read the contents from fd for size bytes, streaming it to the
+ * Read the contents from 'source' for 'size' bytes, streaming it to the
  * packfile in state while updating the hash in ctx. Signal a failure
  * by returning a negative value when the resulting pack would exceed
  * the pack size limit and this is not the first object in the pack,
@@ -157,7 +190,7 @@  static int already_written(struct bulk_checkin_packfile *state, struct object_id
  */
 static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 			       git_hash_ctx *ctx, off_t *already_hashed_to,
-			       int fd, size_t size, const char *path,
+			       struct bulk_checkin_source *source,
 			       unsigned flags)
 {
 	git_zstream s;
@@ -167,22 +200,28 @@  static int stream_blob_to_pack(struct bulk_checkin_packfile *state,
 	int status = Z_OK;
 	int write_object = (flags & HASH_WRITE_OBJECT);
 	off_t offset = 0;
+	size_t size = source->size;
 
 	git_deflate_init(&s, pack_compression_level);
 
-	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
+	hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB,
+					      size);
 	s.next_out = obuf + hdrlen;
 	s.avail_out = sizeof(obuf) - hdrlen;
 
 	while (status != Z_STREAM_END) {
 		if (size && !s.avail_in) {
 			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
-			ssize_t read_result = read_in_full(fd, ibuf, rsize);
+			ssize_t read_result;
+
+			read_result = bulk_checkin_source_read(source, ibuf,
+							       rsize);
 			if (read_result < 0)
-				die_errno("failed to read from '%s'", path);
+				die_errno("failed to read from '%s'",
+					  source->path);
 			if (read_result != rsize)
 				die("failed to read %d bytes from '%s'",
-				    (int)rsize, path);
+				    (int)rsize, source->path);
 			offset += rsize;
 			if (*already_hashed_to < offset) {
 				size_t hsize = offset - *already_hashed_to;
@@ -325,6 +364,12 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	git_hash_ctx ctx;
 	struct hashfile_checkpoint checkpoint = {0};
 	struct pack_idx_entry *idx = NULL;
+	struct bulk_checkin_source source = {
+		.type = SOURCE_FILE,
+		.fd = fd,
+		.size = size,
+		.path = path,
+	};
 
 	seekback = lseek(fd, 0, SEEK_CUR);
 	if (seekback == (off_t) -1)
@@ -342,10 +387,10 @@  static int deflate_blob_to_pack(struct bulk_checkin_packfile *state,
 	while (1) {
 		prepare_checkpoint(state, &checkpoint, idx, flags);
 		if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
-					 fd, size, path, flags))
+					 &source, flags))
 			break;
 		truncate_checkpoint(state, &checkpoint, idx);
-		if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
+		if (bulk_checkin_source_seek_to(&source, seekback) == (off_t)-1)
 			return error("cannot seek back");
 	}
 	finalize_checkpoint(state, &ctx, &checkpoint, idx, result_oid);