Message ID | 9bd273f8c94fdb0c3adf8aedef3480ff5f4232b8.1611676886.git.gitgitgadget@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Refactor chunk-format into an API | expand |
On Tue, Jan 26, 2021 at 04:01:11PM +0000, Derrick Stolee via GitGitGadget wrote:
> +/*
> + * When writing a chunk-based file format, collect the chunks in
> + * an array of chunk_info structs. The size stores the _expected_
> + * amount of data that will be written by write_fn.
> + */
> +struct chunk_info {
> +	uint32_t id;
> +	uint64_t size;

Hmm. Would we not want an off_t to indicate the size here?

I wondered briefly if we even needed a size field at all, since calling
write_fn would tell us the number of bytes written. But I suppose you
want to know ahead of time so that you can write the file in one pass
(beginning with the table of contents, which certainly needs to know the
size).

> +	/* Trailing entry marks the end of the chunks */
> +	hashwrite_be32(cf->f, 0);
> +	hashwrite_be64(cf->f, cur_offset);
> +
> +	for (i = 0; i < cf->chunks_nr; i++) {
> +		uint64_t start_offset = cf->f->total + cf->f->offset;
> +		int result = cf->chunks[i].write_fn(cf->f, data);
> +
> +		if (result)
> +			return result;
> +
> +		if (cf->f->total + cf->f->offset != start_offset + cf->chunks[i].size)

I don't think this is a practical concern, but a malicious caller could
overflow this by passing a bogus "size" parameter. Maybe:

    uint64_t end_offset = ...;

    if (end_offset - start_offset != cf->chunks[i].size)
      BUG(...)

?
> diff --git a/chunk-format.h b/chunk-format.h
> new file mode 100644
> index 00000000000..bfaed672813
> --- /dev/null
> +++ b/chunk-format.h
> @@ -0,0 +1,20 @@
> +#ifndef CHUNK_FORMAT_H
> +#define CHUNK_FORMAT_H
> +
> +#include "git-compat-util.h"
> +
> +struct hashfile;
> +struct chunkfile;
> +
> +struct chunkfile *init_chunkfile(struct hashfile *f);
> +void free_chunkfile(struct chunkfile *cf);
> +int get_num_chunks(struct chunkfile *cf);
> +typedef int (*chunk_write_fn)(struct hashfile *f,
> +			      void *data);
> +void add_chunk(struct chunkfile *cf,
> +	       uint64_t id,
> +	       chunk_write_fn fn,
> +	       size_t size);
> +int write_chunkfile(struct chunkfile *cf, void *data);

Very clean API.

Thanks,
Taylor
On 1/26/2021 9:42 PM, Taylor Blau wrote:
> On Tue, Jan 26, 2021 at 04:01:11PM +0000, Derrick Stolee via GitGitGadget wrote:
>> +/*
>> + * When writing a chunk-based file format, collect the chunks in
>> + * an array of chunk_info structs. The size stores the _expected_
>> + * amount of data that will be written by write_fn.
>> + */
>> +struct chunk_info {
>> +	uint32_t id;
>> +	uint64_t size;
>
> Hmm. Would we not want an off_t to indicate the size here?
>
> I wondered briefly if we even needed a size field at all, since calling
> write_fn would tell us the number of bytes written. But I suppose you
> want to know ahead of time so that you can write the file in one pass
> (beginning with the table of contents, which certainly needs to know the
> size).

Is off_t 64 bits on a 32-bit machine? This is intentionally typed to be
"64 bits no matter what" because it correlates with the file format's size
for the chunk offsets.

>> +		if (cf->f->total + cf->f->offset != start_offset + cf->chunks[i].size)
>
> I don't think this is a practical concern, but a malicious caller could
> overflow this by passing a bogus "size" parameter. Maybe:
>
>   uint64_t end_offset = ...;
>
>   if (end_offset - start_offset != cf->chunks[i].size)
>     BUG(...)

Sure.

Thanks,
-Stolee
diff --git a/Makefile b/Makefile
index 7b64106930a..50a7663841e 100644
--- a/Makefile
+++ b/Makefile
@@ -854,6 +854,7 @@ LIB_OBJS += bundle.o
 LIB_OBJS += cache-tree.o
 LIB_OBJS += chdir-notify.o
 LIB_OBJS += checkout.o
+LIB_OBJS += chunk-format.o
 LIB_OBJS += color.o
 LIB_OBJS += column.o
 LIB_OBJS += combine-diff.o
diff --git a/chunk-format.c b/chunk-format.c
new file mode 100644
index 00000000000..2ce37ecc6bb
--- /dev/null
+++ b/chunk-format.c
@@ -0,0 +1,105 @@
+#include "cache.h"
+#include "chunk-format.h"
+#include "csum-file.h"
+
+/*
+ * Size in bytes of one table-of-contents entry: a 32-bit chunk id
+ * followed by a 64-bit offset.
+ */
+#define CHUNK_LOOKUP_WIDTH 12
+
+/*
+ * When writing a chunk-based file format, collect the chunks in
+ * an array of chunk_info structs. The size stores the _expected_
+ * amount of data that will be written by write_fn.
+ */
+struct chunk_info {
+	uint32_t id;
+	uint64_t size;
+	chunk_write_fn write_fn;
+};
+
+/*
+ * Write-side state: the hashfile receiving the output and the
+ * growable array of chunks registered so far.
+ */
+struct chunkfile {
+	struct hashfile *f;
+
+	struct chunk_info *chunks;
+	size_t chunks_nr;
+	size_t chunks_alloc;
+};
+
+struct chunkfile *init_chunkfile(struct hashfile *f)
+{
+	struct chunkfile *cf = xcalloc(1, sizeof(*cf));
+	cf->f = f;
+	return cf;
+}
+
+void free_chunkfile(struct chunkfile *cf)
+{
+	if (!cf)
+		return;
+	free(cf->chunks);
+	free(cf);
+}
+
+int get_num_chunks(struct chunkfile *cf)
+{
+	return cf->chunks_nr;
+}
+
+void add_chunk(struct chunkfile *cf,
+	       uint64_t id,
+	       chunk_write_fn fn,
+	       size_t size)
+{
+	ALLOC_GROW(cf->chunks, cf->chunks_nr + 1, cf->chunks_alloc);
+
+	/* chunk_info.id is 32 bits wide, so a wider id is truncated here. */
+	cf->chunks[cf->chunks_nr].id = id;
+	cf->chunks[cf->chunks_nr].write_fn = fn;
+	cf->chunks[cf->chunks_nr].size = size;
+	cf->chunks_nr++;
+}
+
+int write_chunkfile(struct chunkfile *cf, void *data)
+{
+	size_t i;
+	/* The file stores 64-bit offsets, so do not accumulate in size_t. */
+	uint64_t cur_offset = cf->f->offset + cf->f->total;
+
+	/* Add the table of contents to the current offset */
+	cur_offset += (cf->chunks_nr + 1) * CHUNK_LOOKUP_WIDTH;
+
+	for (i = 0; i < cf->chunks_nr; i++) {
+		hashwrite_be32(cf->f, cf->chunks[i].id);
+		hashwrite_be64(cf->f, cur_offset);
+
+		cur_offset += cf->chunks[i].size;
+	}
+
+	/* Trailing entry marks the end of the chunks */
+	hashwrite_be32(cf->f, 0);
+	hashwrite_be64(cf->f, cur_offset);
+
+	for (i = 0; i < cf->chunks_nr; i++) {
+		uint64_t start_offset = cf->f->total + cf->f->offset;
+		uint64_t written;
+		int result = cf->chunks[i].write_fn(cf->f, data);
+
+		if (result)
+			return result;
+
+		/* Bytes actually written for this chunk. */
+		written = cf->f->total + cf->f->offset - start_offset;
+		if (written != cf->chunks[i].size)
+			BUG("expected to write %"PRIu64" bytes to chunk %"PRIx32", but wrote %"PRIu64" instead",
+			    cf->chunks[i].size, cf->chunks[i].id,
+			    written);
+	}
+
+	return 0;
+}
diff --git a/chunk-format.h b/chunk-format.h
new file mode 100644
index 00000000000..bfaed672813
--- /dev/null
+++ b/chunk-format.h
@@ -0,0 +1,34 @@
+#ifndef CHUNK_FORMAT_H
+#define CHUNK_FORMAT_H
+
+#include "git-compat-util.h"
+
+struct hashfile;
+struct chunkfile;
+
+/* Allocate a chunkfile that writes to 'f'; release with free_chunkfile(). */
+struct chunkfile *init_chunkfile(struct hashfile *f);
+/* Free 'cf' and its chunk list (not the hashfile). NULL is a no-op. */
+void free_chunkfile(struct chunkfile *cf);
+/* Number of chunks registered with add_chunk() so far. */
+int get_num_chunks(struct chunkfile *cf);
+/* Write one chunk to 'f'; nonzero stops write_chunkfile() early. */
+typedef int (*chunk_write_fn)(struct hashfile *f,
+			      void *data);
+/*
+ * Register a chunk: 'id' goes into the table of contents, 'size' is the
+ * number of bytes 'fn' is expected to write. The id is stored as 32 bits.
+ */
+void add_chunk(struct chunkfile *cf,
+	       uint64_t id,
+	       chunk_write_fn fn,
+	       size_t size);
+/*
+ * Write the table of contents, then call each chunk's write function in
+ * the order added, passing 'data' through. Returns 0, or the first
+ * nonzero value returned by a write function. It is a BUG() for a chunk
+ * to write a different number of bytes than its registered size.
+ */
+int write_chunkfile(struct chunkfile *cf, void *data);
+
+#endif